From 7a9f1fab2745b517a3e70ff833f4ae99fd5dab8a Mon Sep 17 00:00:00 2001
From: aj
Date: Sat, 25 Jan 2020 20:20:17 +0000
Subject: [PATCH] added cache, added audio feature prep

---
 analysis/__init__.py   |  0
 analysis/cache.py      | 47 +++++++++++++++++++++++++
 audio-features-prep.py | 78 ++++++++++++++++++++++++++++++++++++++++--
 cache-transform.py     | 32 +++++++++++++++++
 scrobble-prep.py       | 41 ++++++++--------------
 5 files changed, 169 insertions(+), 29 deletions(-)
 create mode 100644 analysis/__init__.py
 create mode 100644 analysis/cache.py
 create mode 100644 cache-transform.py

diff --git a/analysis/__init__.py b/analysis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/analysis/cache.py b/analysis/cache.py
new file mode 100644
index 0000000..5710039
--- /dev/null
+++ b/analysis/cache.py
@@ -0,0 +1,47 @@
+import os
+import logging
+import json
+
+logger = logging.getLogger(__name__)
+
+
+class Cache:
+    def __init__(self, cache):
+        self.cache = cache
+
+    def set_track(self, name, artist, uri=None, audio_features=None):
+        name = str(name).lower()
+        artist = str(artist).lower()
+
+        if self.cache['cache'].get(artist) is None:
+            self.cache['cache'][artist] = {name: {}}
+        if self.cache['cache'][artist].get(name) is None:
+            self.cache['cache'][artist][name] = {}
+
+        if uri is not None:
+            self.cache['cache'][artist][name]['uri'] = uri
+        if audio_features is not None:
+            self.cache['cache'][artist][name]['features'] = audio_features
+
+    def get_track(self, name, artist):
+        name = str(name).lower()
+        artist = str(artist).lower()
+        try:
+            return self.cache['cache'][artist][name]
+        except KeyError:
+            return None
+
+
+def load_cache_from_storage(path: str = '.', name: str = 'cache.json'):
+
+    if os.path.exists(os.path.join(path, name)):
+        with open(os.path.join(path, name), 'r') as file:
+            return Cache(json.loads(file.read()))
+    else:
+        logger.error(f'{os.path.join(path, name)} does not exist, returning empty cache')
+        return Cache({'cache': {}})
+
+
+def write_cache_to_storage(cache: Cache, path: str = '.', name: str = 'cache.json'):
+    with open(os.path.join(path, name), 'w') as file:
+        file.write(json.dumps(cache.cache))
diff --git a/audio-features-prep.py b/audio-features-prep.py
index 13a3f01..f4e9bec 100644
--- a/audio-features-prep.py
+++ b/audio-features-prep.py
@@ -3,17 +3,91 @@ from fmframework.net.network import Network as FmNet
 from spotframework.net.network import Network as SpotNet
 from spotframework.net.user import NetworkUser
 from spotframework.model.uri import Uri
+from google.cloud import bigquery
 
 from csv import DictWriter
-import os
 import datetime
-import json
+import os
 
 from log import logger
 
+import analysis.cache
+
 spotnet = SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
                               client_secret=os.environ['SPOT_SECRET'],
                               refresh_token=os.environ['SPOT_REFRESH']).refresh_access_token())
 fmnet = FmNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])
+cache = analysis.cache.load_cache_from_storage()
+client = bigquery.Client()
+
+# query all distinct scrobbled tracks that already have a spotify uri
+QUERY = (
+    'SELECT '
+    '   DISTINCT uri, track, album, artist '
+    'FROM `sarsooxyz.scrobbles.*` '
+    'WHERE '
+    '   uri IS NOT NULL '
+    'ORDER BY artist '
+)
+logger.info('querying uris')
+query_job = client.query(QUERY)
+rows = query_job.result()
+
+features = []
+for_pulling = []
+
+logger.info('polling cache')
+for row in rows:
+    cache_entry = cache.get_track(row.track, row.artist)
+
+    if cache_entry is not None:
+        if cache_entry.get('features') is not None:
+            features.append(cache_entry['features'])
+            continue
+
+    for_pulling.append(row)
+
+logger.info('pulling tracks')
+tracks = spotnet.get_tracks(uri_strings=[i.uri for i in for_pulling]) or []
+
+if len(tracks) > 0:
+    logger.info('populating features')
+    tracks = spotnet.populate_track_audio_features(tracks)
+    features += [i.audio_features.to_dict() for i in tracks if i.audio_features is not None]
+
+logger.info('caching pulled')
+for cacheable in for_pulling:
+    track = next((i for i in tracks if str(i.uri) == cacheable.uri), None)
+    if track is not None and track.audio_features is not None:
+        cache.set_track(name=cacheable.track, artist=cacheable.artist, audio_features=track.audio_features.to_dict())
+
+logger.info('dumping')
+date = str(datetime.date.today())
+with open(f'{date}_features.csv', 'w', newline='') as fileobj:
+
+    headers = ['acousticness',
+               'analysis_url',
+               'danceability',
+               'duration_ms',
+               'energy',
+               'uri',
+               'instrumentalness',
+               'key',
+               'key_code',
+               'liveness',
+               'loudness',
+               'mode',
+               'speechiness',
+               'tempo',
+               'time_signature',
+               'track_href',
+               'valence']
+    writer = DictWriter(fileobj, fieldnames=headers, dialect='excel-tab')
+    writer.writeheader()
+
+    for feature in features:
+        writer.writerow(feature)
+
+analysis.cache.write_cache_to_storage(cache)
diff --git a/cache-transform.py b/cache-transform.py
new file mode 100644
index 0000000..97d02d8
--- /dev/null
+++ b/cache-transform.py
@@ -0,0 +1,32 @@
+import os, json, pprint
+
+uri_cache_name = 'cache.json'
+if os.path.isfile(uri_cache_name):
+    with open(uri_cache_name, 'r') as uri_cache:
+        uris = json.loads(uri_cache.read())
+
+    new_cache = {
+        'cache': {}
+    }
+
+    for uri in uris:
+
+        try:
+            new_cache['cache'][uri['artist']]
+        except KeyError:
+            new_cache['cache'][uri['artist']] = {}
+
+        try:
+            new_cache['cache'][uri['artist']][uri['name']]
+        except KeyError:
+            new_cache['cache'][uri['artist']][uri['name']] = {}
+
+        new_cache['cache'][uri['artist']][uri['name']]['uri'] = uri['uri']
+
+    pprint.pprint(new_cache)
+
+    with open(uri_cache_name, 'w') as uri_cache:
+        uri_cache.write(json.dumps(new_cache))
+
+
+
diff --git a/scrobble-prep.py b/scrobble-prep.py
index f87c872..b58c37e 100644
--- a/scrobble-prep.py
+++ b/scrobble-prep.py
@@ -8,58 +8,46 @@ from csv import DictWriter
 import os
 import datetime
-import json
 
 from log import logger
 
+import analysis.cache
+
 spotnet = SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
                               client_secret=os.environ['SPOT_SECRET'],
                               refresh_token=os.environ['SPOT_REFRESH']).refresh_access_token())
 fmnet = FmNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])
 
 # initialise cache
-uri_cache_name = 'uris.json'
-if os.path.isfile(uri_cache_name):
-    with open(uri_cache_name, 'r') as uri_cache:
-        uris = json.loads(uri_cache.read())
-else:
-    uris = []
+cache = analysis.cache.load_cache_from_storage()
 
 # scrobble range
-from_date = datetime.datetime(year=2018, month=1, day=1)
-to_date = datetime.datetime(year=2019, month=1, day=1)
+from_date = datetime.datetime(year=2019, month=1, day=1)
+to_date = datetime.datetime(year=2020, month=1, day=1)
 
 scrobbles = fmnet.get_recent_tracks(from_time=from_date, to_time=to_date, page_limit=200)
 
 # populate with uris
 for scrobble in scrobbles:
-    cache_entry = [i for i in uris if
-                   i['name'] == scrobble.track.name.lower() and
-                   i['artist'] == scrobble.track.artist.name.lower()]
+    cache_entry = cache.get_track(name=scrobble.track.name.lower(), artist=scrobble.track.artist.name.lower())
 
-    # check cache
-    if len(cache_entry) == 0:
+    if cache_entry is not None and cache_entry.get('uri'):
+        scrobble.uri = cache_entry.get('uri')
+    else:
         logger.info(f'pulling {scrobble.track}')
         spotify_search = spotnet.search(query_types=[Uri.ObjectType.track],
                                         track=scrobble.track.name,
                                         artist=scrobble.track.artist.name,
                                         response_limit=5).tracks
 
         if len(spotify_search) > 0:
-            uris.append({
-                'name': scrobble.track.name.lower(),
-                'artist': scrobble.track.artist.name.lower(),
-                'uri': str(spotify_search[0].uri)
-            })
-            scrobble.uri = spotify_search[0].uri
+            cache.set_track(name=scrobble.track.name.lower(),
+                            artist=scrobble.track.artist.name.lower(),
+                            uri=str(spotify_search[0].uri))
+            scrobble.uri = str(spotify_search[0].uri)
         else:
             logger.debug('no search tracks returned')
             scrobble.uri = None
 
-    # cache entry available
-    else:
-        # logger.info(f'{scrobble.track} found in cache')
-        scrobble.uri = cache_entry[0]['uri']
-
 date = str(datetime.date.today())
 with open(f'{date}_scrobbles.csv', 'w', newline='') as fileobj:
@@ -79,5 +67,4 @@ with open(f'{date}_scrobbles.csv', 'w', newline='') as fileobj:
         'uri': str(scrobble.uri) if scrobble.uri is not None else ''
     })
 
-with open(uri_cache_name, 'w') as uri_cache:
-    uri_cache.write(json.dumps(uris))
+analysis.cache.write_cache_to_storage(cache)
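
-- 
A minimal round-trip of the new analysis.cache module, assuming cache.json
sits in the working directory (the default for load/write); the track,
artist, and uri values below are hypothetical placeholders, not real
catalogue data:

    import analysis.cache

    # load the persisted cache, or an empty Cache if cache.json is missing
    cache = analysis.cache.load_cache_from_storage()

    # names and artists are lowercased on both write and read,
    # so lookups are case-insensitive
    cache.set_track(name='Some Track', artist='Some Artist',
                    uri='spotify:track:0000000000000000000000')  # hypothetical uri
    print(cache.get_track(name='SOME TRACK', artist='Some Artist'))

    # flush back to cache.json for the next prep run
    analysis.cache.write_cache_to_storage(cache)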