From 7a9f1fab2745b517a3e70ff833f4ae99fd5dab8a Mon Sep 17 00:00:00 2001
From: aj
Date: Sat, 25 Jan 2020 20:20:17 +0000
Subject: [PATCH] added cache, added audio feature prep

---
 analysis/__init__.py   |  0
 analysis/cache.py      | 47 +++++++++++++++++++++++++
 audio-features-prep.py | 78 ++++++++++++++++++++++++++++++++++++++++--
 cache-transform.py     | 32 +++++++++++++++++
 scrobble-prep.py       | 41 ++++++++--------------
 5 files changed, 169 insertions(+), 29 deletions(-)
 create mode 100644 analysis/__init__.py
 create mode 100644 analysis/cache.py
 create mode 100644 cache-transform.py

diff --git a/analysis/__init__.py b/analysis/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/analysis/cache.py b/analysis/cache.py
new file mode 100644
index 0000000..5710039
--- /dev/null
+++ b/analysis/cache.py
@@ -0,0 +1,47 @@
+import os
+import logging
+import json
+
+logger = logging.getLogger(__name__)
+
+
+class Cache:
+    def __init__(self, cache):
+        self.cache = cache
+
+    def set_track(self, name, artist, uri=None, audio_features=None):
+        name = str(name).lower()
+        artist = str(artist).lower()
+
+        if self.cache['cache'].get(artist) is None:
+            self.cache['cache'][artist] = {name: {}}
+        if self.cache['cache'][artist].get(name) is None:
+            self.cache['cache'][artist][name] = {}
+
+        if uri is not None:
+            self.cache['cache'][artist][name]['uri'] = uri
+        if audio_features is not None:
+            self.cache['cache'][artist][name]['features'] = audio_features
+
+    def get_track(self, name, artist):
+        name = str(name).lower()
+        artist = str(artist).lower()
+        try:
+            return self.cache['cache'][artist][name]
+        except KeyError:
+            return None
+
+
+def load_cache_from_storage(path: str = '.', name: str = 'cache.json'):
+
+    if os.path.exists(os.path.join(path, name)):
+        with open(os.path.join(path, name), 'r') as file:
+            return Cache(json.loads(file.read()))
+    else:
+        logger.error(f'{os.path.join(path, name)} does not exist, returning empty cache')
+        return Cache({'cache': {}})
+
+
+def write_cache_to_storage(cache: Cache, path: str = '.', name: str = 'cache.json'):
+    with open(os.path.join(path, name), 'w') as file:
+        file.write(json.dumps(cache.cache))
diff --git a/audio-features-prep.py b/audio-features-prep.py
index 13a3f01..f4e9bec 100644
--- a/audio-features-prep.py
+++ b/audio-features-prep.py
@@ -3,17 +3,91 @@ from fmframework.net.network import Network as FmNet
 from spotframework.net.network import Network as SpotNet
 from spotframework.net.user import NetworkUser
 from spotframework.model.uri import Uri
+from google.cloud import bigquery
 
 from csv import DictWriter
-import os
 import datetime
-import json
+import os
 
 from log import logger
 
+import analysis.cache
+
 spotnet = SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
                               client_secret=os.environ['SPOT_SECRET'],
                               refresh_token=os.environ['SPOT_REFRESH']).refresh_access_token())
 fmnet = FmNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])
+cache = analysis.cache.load_cache_from_storage()
+client = bigquery.Client()
+
+# query all distinct scrobbled tracks that already have a spotify uri
+QUERY = (
+    'SELECT '
+    '   DISTINCT uri, track, album, artist '
+    'FROM `sarsooxyz.scrobbles.*` '
+    'WHERE '
+    '   uri IS NOT NULL '
+    'ORDER BY artist '
+)
+logger.info('querying uris')
+query_job = client.query(QUERY)
+rows = query_job.result()
+
+features = []
+for_pulling = []
+
+logger.info('polling cache')
+for row in rows:
+    cache_entry = cache.get_track(row.track, row.artist)
+
+    if cache_entry is not None:
+        if cache_entry.get('features') is not None:
+            features.append(cache_entry['features'])
+            continue
+
+    for_pulling.append(row)
+
+logger.info('pulling tracks')
+tracks = spotnet.get_tracks(uri_strings=[i.uri for i in for_pulling]) or []
+
+if len(tracks) > 0:
+    logger.info('populating features')
+    tracks = spotnet.populate_track_audio_features(tracks)
+    features += [i.audio_features.to_dict() for i in tracks if i.audio_features is not None]
+
+logger.info('caching pulled')
+for cacheable in for_pulling:
+    track = next((i for i in tracks if str(i.uri) == cacheable.uri), None)
+    if track is not None and track.audio_features is not None:
+        cache.set_track(name=cacheable.track, artist=cacheable.artist, audio_features=track.audio_features.to_dict())
+
+logger.info('dumping')
+date = str(datetime.date.today())
+with open(f'{date}_features.csv', 'w', newline='') as fileobj:
+
+    headers = ['acousticness',
+               'analysis_url',
+               'danceability',
+               'duration_ms',
+               'energy',
+               'uri',
+               'instrumentalness',
+               'key',
+               'key_code',
+               'liveness',
+               'loudness',
+               'mode',
+               'speechiness',
+               'tempo',
+               'time_signature',
+               'track_href',
+               'valence']
+    writer = DictWriter(fileobj, fieldnames=headers, dialect='excel-tab')
+    writer.writeheader()
+
+    for feature in features:
+        writer.writerow(feature)
+
+analysis.cache.write_cache_to_storage(cache)
diff --git a/cache-transform.py b/cache-transform.py
new file mode 100644
index 0000000..97d02d8
--- /dev/null
+++ b/cache-transform.py
@@ -0,0 +1,32 @@
+import os, json, pprint
+
+uri_cache_name = 'cache.json'
+if os.path.isfile(uri_cache_name):
+    with open(uri_cache_name, 'r') as uri_cache:
+        uris = json.loads(uri_cache.read())
+
+    new_cache = {
+        'cache': {}
+    }
+
+    for uri in uris:
+
+        try:
+            new_cache['cache'][uri['artist']]
+        except KeyError:
+            new_cache['cache'][uri['artist']] = {}
+
+        try:
+            new_cache['cache'][uri['artist']][uri['name']]
+        except KeyError:
+            new_cache['cache'][uri['artist']][uri['name']] = {}
+
+        new_cache['cache'][uri['artist']][uri['name']]['uri'] = uri['uri']
+
+    pprint.pprint(new_cache)
+
+    with open(uri_cache_name, 'w') as uri_cache:
+        uri_cache.write(json.dumps(new_cache))
+
+
+
diff --git a/scrobble-prep.py b/scrobble-prep.py
index f87c872..b58c37e 100644
--- a/scrobble-prep.py
+++ b/scrobble-prep.py
@@ -8,58 +8,46 @@ from csv import DictWriter
 import os
 import datetime
-import json
 
 from log import logger
 
+import analysis.cache
+
 spotnet = SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
                               client_secret=os.environ['SPOT_SECRET'],
                               refresh_token=os.environ['SPOT_REFRESH']).refresh_access_token())
 fmnet = FmNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])
 
 # initialise cache
-uri_cache_name = 'uris.json'
-if os.path.isfile(uri_cache_name):
-    with open(uri_cache_name, 'r') as uri_cache:
-        uris = json.loads(uri_cache.read())
-else:
-    uris = []
+cache = analysis.cache.load_cache_from_storage()
 
 # scrobble range
-from_date = datetime.datetime(year=2018, month=1, day=1)
-to_date = datetime.datetime(year=2019, month=1, day=1)
+from_date = datetime.datetime(year=2019, month=1, day=1)
+to_date = datetime.datetime(year=2020, month=1, day=1)
 
 scrobbles = fmnet.get_recent_tracks(from_time=from_date, to_time=to_date, page_limit=200)
 
 # populate with uris
 for scrobble in scrobbles:
-    cache_entry = [i for i in uris if
-                   i['name'] == scrobble.track.name.lower() and
-                   i['artist'] == scrobble.track.artist.name.lower()]
+    cache_entry = cache.get_track(name=scrobble.track.name.lower(), artist=scrobble.track.artist.name.lower())
 
-    # check cache
-    if len(cache_entry) == 0:
+    if cache_entry is not None and cache_entry.get('uri'):
+        scrobble.uri = cache_entry.get('uri')
+    else:
         logger.info(f'pulling {scrobble.track}')
         spotify_search = spotnet.search(query_types=[Uri.ObjectType.track],
                                         track=scrobble.track.name,
                                         artist=scrobble.track.artist.name,
                                         response_limit=5).tracks
 
         if len(spotify_search) > 0:
-            uris.append({
-                'name': scrobble.track.name.lower(),
-                'artist': scrobble.track.artist.name.lower(),
-                'uri': str(spotify_search[0].uri)
-            })
-            scrobble.uri = spotify_search[0].uri
+            cache.set_track(name=scrobble.track.name.lower(),
+                            artist=scrobble.track.artist.name.lower(),
+                            uri=str(spotify_search[0].uri))
+            scrobble.uri = str(spotify_search[0].uri)
         else:
             logger.debug('no search tracks returned')
             scrobble.uri = None
 
-    # cache entry available
-    else:
-        # logger.info(f'{scrobble.track} found in cache')
-        scrobble.uri = cache_entry[0]['uri']
-
 date = str(datetime.date.today())
 with open(f'{date}_scrobbles.csv', 'w', newline='') as fileobj:
@@ -79,5 +67,4 @@ with open(f'{date}_scrobbles.csv', 'w', newline='') as fileobj:
         'uri': str(scrobble.uri) if scrobble.uri is not None else ''
     })
 
-with open(uri_cache_name, 'w') as uri_cache:
-    uri_cache.write(json.dumps(uris))
+analysis.cache.write_cache_to_storage(cache)
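
-- 
A minimal round-trip of the new analysis.cache module, assuming cache.json
sits in the working directory (the default for load/write); the track,
artist, and uri values below are hypothetical placeholders, not real
catalogue data:

    import analysis.cache

    # load the persisted cache, or an empty Cache if cache.json is missing
    cache = analysis.cache.load_cache_from_storage()

    # names and artists are lowercased on both write and read,
    # so lookups are case-insensitive
    cache.set_track(name='Some Track', artist='Some Artist',
                    uri='spotify:track:0000000000000000000000')  # hypothetical uri
    print(cache.get_track(name='SOME TRACK', artist='Some Artist'))

    # flush back to cache.json for the next prep run
    analysis.cache.write_cache_to_storage(cache)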