Implement user playlist scraping & change track scrapers to generators

feature/scrape_user_items
Michael K. Steinberg 2023-01-09 20:47:59 +02:00
rodzic a0f85e6ca5
commit 3706a095a9
2 zmienionych plików z 44 dodań i 25 usunięć

Wyświetl plik

@ -65,7 +65,7 @@ def download_track_list(download_dir: str, track_list: list, recursive_artist: b
track.download_to_file(scraper, track_path)
console.happy(f'Thread<{my_thread_id}> | Downloaded: {track.preview_title()}')
if (recursive_album or recursive) and len(track_list) < recursive_limit:
new_tracks = scraper.scrape_album_tracks(track.album.spotify_id)
new_tracks = list(scraper.scrape_album_tracks(track.album.spotify_id))
for new_track in new_tracks:
if new_track not in track_list and len(track_list) < recursive_limit:
track_list.append(new_track)
@ -84,11 +84,11 @@ def download_track_list(download_dir: str, track_list: list, recursive_artist: b
if (recursive_artist or recursive) and len(track_list) < recursive_limit:
old_size = len(track_list)
track_list += scraper.scrape_artist_tracks(artist.spotify_id)
track_list += list(scraper.scrape_artist_tracks(artist.spotify_id))
if recursive_artist:
albums = scraper.scrape_artist_albums(artist.spotify_id)
albums = list(scraper.scrape_artist_albums(artist.spotify_id))
for album in albums:
track_list += scraper.scrape_album_tracks(album['id'])
track_list += list(scraper.scrape_album_tracks(album['id']))
console.log(f'Thread<{my_thread_id}> | Scraped {len(track_list) - old_size} new songs through recursive artist!')
except Exception as ex:
console.error(f'Thread<{my_thread_id}> | Exception: {ex}')
@ -129,15 +129,16 @@ def full_download(download_dir: str, identifier: str, recursive_artist: bool=Fal
client.refresh_tokens()
console.log(f'Recieved scrape command on identifier: {identifier}, {recursive=}, {recursive_artist=}, {recursive_album=}, {recursive_limit=}, {thread_count=}')
track_list = scraper.scrape_tracks(identifier, console=console)
console.log(f'Scraping on identifier: {identifier} yielded {len(track_list)} tracks!')
#console.log(f'Scraping on identifier: {identifier} yielded {len(track_list)} tracks!')
download_threads = []
thread_subsection_size = int(len(track_list) / thread_count)
for i in range(thread_count - 1):
download_threads.append(Thread(target=download_track_list, args=(download_dir, track_list[thread_subsection_size * i : (thread_subsection_size * i) + thread_subsection_size], recursive_artist, recursive_album, recursive, recursive_limit)))
download_threads[-1].start()
sleep(0.05)
download_threads.append(Thread(target=download_track_list, args=(download_dir, track_list[thread_subsection_size * (thread_count - 1):], recursive_artist, recursive_album, recursive, recursive_limit)))
track_list = []
for track in scraper.scrape_tracks(identifier, console=console):
track_list.append(track)
if len(track_list) == recursive_limit / thread_count:
download_threads.append(Thread(target=download_track_list, args=(download_dir, list(track_list), recursive_artist, recursive_album, recursive, recursive_limit)))
download_threads[-1].start()
sleep(0.05)
download_threads.append(Thread(target=download_track_list, args=(download_dir, list(track_list), recursive_artist, recursive_album, recursive, recursive_limit)))
download_threads[-1].start()
[x.join() for x in download_threads]
@ -165,7 +166,7 @@ def download_all_categories_playlists(download_meta_data_only=True):
playlist = scraper.get_playlist(playlist_id)
playlist.export_to_file()
if not download_meta_data_only:
full_download(f'{settings.DEFAULT_DOWNLOAD_DIRECTORY}', identifier=playlist.href)
full_download(f'{settings.DEFAULT_DOWNLOAD_DIRECTORY}', identifier=playlist.href, thread_count=15)
except Exception as ex:
console.error(f'Scraping categories exception: {ex}')
except Exception as ex:

Wyświetl plik

@ -13,6 +13,7 @@ class SpotifyScraper:
Album = 1
Artist = 2
Track = 3
User = 4
Unknown = -1
def __init__(self, sp_dc=None, sp_key=None, client=None) -> None:
@ -30,6 +31,8 @@ class SpotifyScraper:
return self.IDTypes.Artist
elif 'track' in link.lower():
return self.IDTypes.Track
elif 'user' in link.lower():
return self.IDTypes.User
return self.IDTypes.Unknown
def extract_id_from_link(self, link: str) -> str:
@ -45,6 +48,8 @@ class SpotifyScraper:
return self.scrape_artist_tracks(self.extract_id_from_link(link), intense=True, console=console)
elif id_type == self.IDTypes.Track:
return [SpotifyTrack(self.get(f'https://api.spotify.com/v1/tracks/{self.extract_id_from_link(link)}').json())]
elif id_type == self.IDTypes.User:
return self.scrape_user_items(self.extract_id_from_link(link))
def scrape_playlist(self, playlist_id: str):
return self._client.get(f'https://api.spotify.com/v1/playlists/{playlist_id}').json()
@ -54,17 +59,22 @@ class SpotifyScraper:
limit = 100
playlist_data = self._client.get(f'https://api.spotify.com/v1/playlists/{playlist_id}/tracks?offset={offset}&limit={limit}&market=from_token').json()
tracks = playlist_data['items']
for track_data in playlist_data['items']:
yield SpotifyTrack(track_data)
while playlist_data['next'] is not None:
offset += limit
playlist_data = self._client.get(f'https://api.spotify.com/v1/playlists/{playlist_id}/tracks?offset={offset}&limit={limit}&market=from_token').json()
tracks += playlist_data['items']
for track_data in playlist_data['items']:
yield SpotifyTrack(track_data)
if len(tracks) != int(playlist_data['total']):
print(f'Warning: track count does not match! {len(tracks)} != {int(playlist_data["tracks"]["total"])}')
for track_data in tracks:
yield SpotifyTrack(track_data)
spotify_tracks = [SpotifyTrack(track_data) for track_data in tracks]
if settings.AUTO_DOWNLOAD_PLAYLIST_METADATA:
playlist = SpotifyPlaylist(playlist_id, spotify_tracks, self.get_playlist_data(playlist_id))
playlist.export_to_file()
return spotify_tracks
def scrape_album(self, album_id: str):
return self._client.get(f'https://api.spotify.com/v1/albums/{album_id}').json()
@ -73,18 +83,13 @@ class SpotifyScraper:
limit = 50
offset = 0
ret = self._client.get(f'https://api.spotify.com/v1/albums/{album_id}/tracks?limit={limit}').json()
tracks = ret['items']
for track in ret['items']:
yield SpotifyTrack(self.get(track['href']).json())
while ret['next'] is not None:
offset += limit
ret = self._client.get(f'https://api.spotify.com/v1/albums/{album_id}/tracks?offset={offset}&limit={limit}').json()
tracks += ret['items']
if len(tracks) != int(ret['total']):
print(f'Warning: track count does not match! {len(tracks)} != {int(ret["total"])}')
processed_tracks = []
for track_data in tracks:
processed_tracks.append(SpotifyTrack(self.get(track_data['href']).json()))
return processed_tracks
for track in ret['items']:
yield SpotifyTrack(self.get(track['href']).json())
def scrape_artist(self, artist_id: str):
return self.get(f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks?market=from_token').json()
@ -107,15 +112,16 @@ class SpotifyScraper:
except:
artist_name = 'Unknown'
proccessed_tracks = [SpotifyTrack(track_data) for track_data in tracks]
yield proccessed_tracks
if intense:
albums = self.scrape_artist_albums(artist_id)
proccessed_album_count = 0
for album in albums:
proccessed_tracks += self.scrape_album_tracks(album['id'])
for track in self.scrape_album_tracks(album['id']):
yield track
proccessed_album_count += 1
if console is not None:
console.log(f'Scraping {artist_name}\'s albums: {proccessed_album_count} / {len(albums)}')
return proccessed_tracks
def get(self, url: str) -> Response:
return self._client.get(url)
@ -168,3 +174,15 @@ class SpotifyScraper:
playlist_data = self.get_playlist_data(playlist_id)
tracks = self.scrape_playlist_tracks(playlist_id)
return SpotifyPlaylist(spotify_id=playlist_id, tracks=tracks, data=playlist_data)
def scrape_user_items(self, user_id: str, limit:int=50) -> list[SpotifyTrack]:
    """Yield every track from every playlist listed for a user.

    The user's playlist listing is fetched page by page (``limit`` playlists
    per request, advancing ``offset`` between requests). Once all playlist
    ids are collected, each playlist is scraped through
    ``scrape_playlist_tracks`` and its tracks are yielded in listing order.

    Args:
        user_id: Id of the user whose playlists should be scraped.
        limit: Page size used for the playlist listing requests.

    Yields:
        Each item produced by ``scrape_playlist_tracks`` for each playlist.
    """
    # NOTE(review): this function is a generator, so the `list[...]` return
    # annotation is inaccurate; it is left untouched to keep the signature
    # stable -- consider switching it to an iterator type.
    offset = 0
    user_playlists = []
    while True:
        # The offset must advance on every request; re-requesting the same
        # URL would return the first page forever whenever 'next' is set.
        user_playlist_set = self.get(f'https://api.spotify.com/v1/users/{user_id}/playlists?offset={offset}&limit={limit}').json()
        page_items = user_playlist_set['items']
        user_playlists.extend(playlist['id'] for playlist in page_items)
        # Stop when there is no further page; an empty page also ends the
        # loop so an inconsistent 'next' value cannot cause endless requests.
        if not user_playlist_set['next'] or not page_items:
            break
        offset += limit
    for playlist_id in user_playlists:
        yield from self.scrape_playlist_tracks(playlist_id)