fixed query numbers, quick cache for playlists

2021-02-03 16:08:06 +00:00 · 2021-02-03 16:08:06 +00:00 · 5e703b011f
commit 5e703b011f
parent 0202649cfa
7 changed files with 580 additions and 114 deletions
--- a/album.ipynb
+++ b/album.ipynb
--- a/analysis.ipynb
+++ b/analysis.ipynb
@ -10,7 +10,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.4-final"
+   "version": "3.8.6-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
@ -291,7 +291,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -324,11 +324,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
-    "scrobbles = get_query()"
+    "scrobbles = get_query(cache=cache)"
   ]
  },
  {
@ -340,11 +340,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
-    "scrobbles.to_csv(cache, sep='\\t')"
+    "scrobbles.reset_index().to_csv(cache, sep='\\t')"
   ]
  }
 ]
--- a/analysis/init.py
+++ b/analysis/init.py
@ -1,6 +1,9 @@
 from datetime import datetime
 import logging
 import pandas as pd
+from dotenv import load_dotenv
+
+load_dotenv()

 float_headers = ["acousticness", "danceability", "energy", "instrumentalness", "liveness", "speechiness", "valence"]
 spotify_descriptor_headers = ["duration_ms", "mode", "loudness", "key", "tempo", "time_signature"] + float_headers
@ -28,5 +31,6 @@ def init_log():
    spotfm_logger.addHandler(stream_handler)

 def days_since(in_date):
-    now = datetime.now()
+    # the dataset currently only covers up to the end of 2020, so "now" is pinned to 2021-01-01
+    now = datetime(year=2021, month=1, day=1)
    return now - in_date
--- a/analysis/net.py
+++ b/analysis/net.py
@ -15,11 +15,19 @@ def get_spotnet():
 def get_fmnet():
    return FMNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])

+playlist_cache = dict() # low-tech caches to avoid repeatedly pulling playlists
+all_playlists = list()
 def get_playlist(name: str, spotnet: SpotNet):
-    playlists = spotnet.playlists()
-    playlist = [i for i in playlists if i.name == name][0]
-    playlist.tracks = spotnet.playlist_tracks(uri=playlist.uri)
-    return playlist
+    global all_playlists
+    try:
+        return playlist_cache[name]
+    except KeyError:
+        if len(all_playlists) == 0:
+            all_playlists = spotnet.playlists()
+        playlist = [i for i in all_playlists if i.name == name][0]
+        playlist.tracks = spotnet.playlist_tracks(uri=playlist.uri)
+        playlist_cache[name] = playlist
+        return playlist

 def track_frame(tracks: List[PlaylistTrack]):
    return pd.DataFrame(
--- a/analysis/query.py
+++ b/analysis/query.py
@ -6,7 +6,7 @@ client = bigquery.Client()

 def all_joined(limit: int = 200):
    query = (
-        'SELECT ' 
+        'SELECT DISTINCT' 
        '   Scrobbles.track, Scrobbles.album, Scrobbles.artist, Scrobbles.time, Scrobbles.uri, '
        '   Features.acousticness, Features.danceability, Features.duration_ms, '
        '   Features.energy, Features.instrumentalness, Features.key, Features.liveness, '
@ -14,7 +14,7 @@ def all_joined(limit: int = 200):
        '   Features.time_signature, Features.valence '

        'FROM `sarsooxyz.scrobbles.*` AS Scrobbles '
-        'INNER JOIN `sarsooxyz.audio_features.features` AS Features '
+        'LEFT JOIN `sarsooxyz.audio_features.features` AS Features '
        'ON Scrobbles.uri = Features.uri '
    )

@ -27,7 +27,11 @@ def get_query(pull=False, cache="query.csv"):
    if pull:
        scrobbles = all_joined(limit=-1) # load dataset as panda frame
    else:
-        scrobbles = pd.read_csv(cache, sep='\t', index_col=0)
+        try:
+            scrobbles = pd.read_csv(cache, sep='\t', index_col=0)
+        except FileNotFoundError:
+            print(f'{cache} not found, pulling')
+            scrobbles = all_joined(limit=-1) # load dataset as panda frame
    scrobbles['time'] = pd.to_datetime(scrobbles['time'])
    scrobbles = scrobbles.set_index('time')
    return scrobbles
--- a/artist.ipynb
+++ b/artist.ipynb
--- a/playlist.ipynb
+++ b/playlist.ipynb