restructured, added notebooks, refreshed data, poetry

This commit is contained in:
andy 2021-02-01 01:37:22 +00:00
parent 7a9f1fab27
commit 8ef8536213
19 changed files with 3000 additions and 223 deletions

4
.gitignore vendored

@@ -4,4 +4,6 @@ __pycache__
.idea
.fm
scratch.py
service.json
cache.json
.env

README.md

@@ -1,4 +1,12 @@
listening analysis
==================
# Listening Analysis
performing analysis on listening habits using last.fm and spotify data
Notebooks: [analysis](analysis.ipynb) and other [stats](stats.ipynb).
Combining Spotify & Last.fm data to explore listening habits and trends.
Uses two data sources:
1. Last.fm scrobbles
2. Spotify audio features
The two are joined by searching for each Last.fm track on Spotify to get a Uri; the track name and artist name are provided for the query.
These Uris can then be used to retrieve Spotify feature descriptors. `all_joined()` runs a BigQuery query that joins the scrobble time series with their audio features and returns the result as a pandas DataFrame, as sketched below.
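A minimal usage sketch (assuming Google Cloud credentials are configured for the BigQuery project holding the scrobble and feature tables):

```python
from analysis.query import all_joined

# joined scrobble + audio feature rows as a pandas DataFrame
frame = all_joined(limit=500)
print(frame[["track", "artist", "energy", "valence"]].head())
```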

407
analysis.ipynb Normal file

File diff suppressed because one or more lines are too long

27
analysis/__init__.py Normal file

@@ -0,0 +1,27 @@
import logging
import pandas as pd
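# Spotify audio feature columns; float_headers are the measures bounded to [0, 1]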
float_headers = ["acousticness", "danceability", "energy", "instrumentalness", "liveness", "speechiness", "valence"]
descriptor_headers = ["duration_ms", "mode", "loudness", "key", "tempo", "time_signature"] + float_headers
def init_log():
logger = logging.getLogger('listening')
spotframework_logger = logging.getLogger('spotframework')
fmframework_logger = logging.getLogger('fmframework')
spotfm_logger = logging.getLogger('spotfm')
logger.setLevel('DEBUG')
spotframework_logger.setLevel('WARNING')
fmframework_logger.setLevel('WARNING')
spotfm_logger.setLevel('WARNING')
log_format = '%(levelname)s %(name)s:%(funcName)s - %(message)s'
formatter = logging.Formatter(log_format)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
spotframework_logger.addHandler(stream_handler)
fmframework_logger.addHandler(stream_handler)
spotfm_logger.addHandler(stream_handler)

analysis/cache.py

@@ -8,13 +8,17 @@ logger = logging.getLogger(__name__)
class Cache:
def __init__(self, cache):
self.cache = cache
# dictionary indexed by artist name followed by track name
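# e.g. {'cache': {'artist name': {'track name': {'uri': 'spotify:track:...', 'features': {...}}}}}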
def set_track(self, name, artist, uri=None, audio_features=None):
name = str(name).lower()
artist = str(artist).lower()
# ARTIST
if self.cache['cache'].get(artist) is None:
self.cache['cache'][artist] = {name: {}}
# TRACK
if self.cache['cache'][artist].get(name) is None:
self.cache['cache'][artist][name] = {}
@@ -39,7 +43,7 @@ def load_cache_from_storage(path: str = '.', name: str = 'cache.json'):
return Cache(json.loads(file.read()))
else:
logger.error(f'{os.path.join(path, name)} does not exist')
return {'cache': {}}
return Cache({'cache': {}})
def write_cache_to_storage(cache: Cache, path: str = '.', name: str = 'cache.json'):

27
analysis/net.py Normal file

@@ -0,0 +1,27 @@
import os
from typing import List
import pandas as pd
from spotframework.model.track import PlaylistTrack
from spotframework.net.network import Network as SpotNet, NetworkUser
def get_spotnet():
return SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
client_secret=os.environ['SPOT_SECRET'],
refresh_token=os.environ['SPOT_REFRESH'])).refresh_access_token()
def get_playlist(name: str, spotnet: SpotNet):
playlists = spotnet.playlists()
playlist = [i for i in playlists if i.name == name][0]
playlist.tracks = spotnet.playlist_tracks(uri=playlist.uri)
return playlist
def track_frame(tracks: List[PlaylistTrack]):
return pd.DataFrame(
[
[i.track.name, i.track.artists[0].name]
for i in tracks
],
columns=["track", "artist"]
)
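# A usage sketch (illustrative, assuming valid SPOT_* environment variables):
#   spotnet = get_spotnet()
#   playlist = get_playlist('my playlist', spotnet)  # 'my playlist' is a hypothetical name
#   frame = track_frame(playlist.tracks)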

102
analysis/prep/feature.py Normal file

@@ -0,0 +1,102 @@
import datetime
import logging
from csv import DictWriter
from google.cloud import bigquery
from analysis.cache import Cache
from fmframework.net import Network as FMNetwork
from spotframework.net.network import Network as SpotNetwork
from spotframework.model.uri import Uri
logger = logging.getLogger('listening')
def prepare_features(spotnet: SpotNetwork,
fmnet: FMNetwork,
cache: Cache,
limit: int = None):
features = populated_features(spotnet=spotnet,
fmnet=fmnet,
cache=cache,
limit=limit)
save_features(features)
def populated_features(spotnet: SpotNetwork,
fmnet: FMNetwork,
cache: Cache,
limit: int = None):
client = bigquery.Client()
QUERY = (
'SELECT '
' DISTINCT uri, track, album, artist '
'FROM `sarsooxyz.scrobbles.*` '
'WHERE '
' uri IS NOT NULL '
'ORDER BY artist '
)
if limit is not None:
QUERY += f'LIMIT {limit} '
logger.info('querying uris')
query_job = client.query(QUERY)
rows = query_job.result()
features = []
for_pulling = []
# HIT CACHE
logger.info('polling cache')
for row in rows:
cache_entry = cache.get_track(row.track, row.artist)
try:
feature = cache_entry['features']
features.append(feature)
except (KeyError, TypeError):
for_pulling.append(row)
# GET SPOTIFY TRACKS
logger.info('pulling tracks')
tracks = spotnet.tracks(uris=[i.uri for i in for_pulling])
if tracks is not None:
logger.info('populating features')
tracks = spotnet.populate_track_audio_features(tracks)
features += [i.audio_features.to_dict() for i in tracks if i.audio_features is not None]
logger.info('caching pulled')
for cacheable in for_pulling:
track = next((i for i in tracks if str(i.uri) == cacheable.uri), None)
if track is not None and track.audio_features is not None:
cache.set_track(name=cacheable.track, artist=cacheable.artist, audio_features=track.audio_features.to_dict())
return features
def save_features(features):
date = str(datetime.datetime.now()).replace(':', '.')
with open(f'{date}_features.csv', 'w', newline='', encoding='UTF-8') as fileobj:
headers = ['acousticness',
'analysis_url',
'danceability',
'duration_ms',
'energy',
'uri',
'instrumentalness',
'key',
'liveness',
'loudness',
'mode',
'speechiness',
'tempo',
'time_signature',
'track_href',
'valence']
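# 'excel-tab' writes tab-separated values, despite the .csv file extension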
writer = DictWriter(fileobj, fieldnames=headers, dialect='excel-tab')
writer.writeheader()
for feature in features:
writer.writerow(feature)

80
analysis/prep/scrobble.py Normal file

@@ -0,0 +1,80 @@
import datetime
import logging
from csv import DictWriter
from analysis.cache import Cache
from fmframework.net import Network as FMNetwork
from spotframework.net.network import Network as SpotNetwork
from spotframework.model.uri import Uri
logger = logging.getLogger(__name__)
def prepare_scrobbles(spotnet: SpotNetwork,
fmnet: FMNetwork,
cache: Cache,
from_date: datetime.datetime = None,
to_date: datetime.datetime = None,
limit: int = None):
scrobbles = populated_scrobbles(spotnet=spotnet,
fmnet=fmnet,
cache=cache,
from_date=from_date,
to_date=to_date,
limit=limit)
save_scrobbles(scrobbles)
def populated_scrobbles(spotnet: SpotNetwork,
fmnet: FMNetwork,
cache: Cache,
from_date: datetime.datetime = None,
to_date: datetime.datetime = None,
limit: int = None):
# get all scrobbles for date range
scrobbles = fmnet.recent_tracks(limit=limit, from_time=from_date, to_time=to_date, page_limit=200)
# populate with uris
for scrobble in scrobbles:
cache_entry = cache.get_track(name=scrobble.track.name.lower(), artist=scrobble.track.artist.name.lower())
if cache_entry is not None and cache_entry.get('uri'):
# uri is cached
scrobble.uri = cache_entry.get('uri')
else:
# cache missed or doesn't have uri
logger.info(f'pulling {scrobble.track}')
spotify_search = spotnet.search(query_types=[Uri.ObjectType.track],
track=scrobble.track.name,
artist=scrobble.track.artist.name,
response_limit=5).tracks
if len(spotify_search) > 0:
cache.set_track(name=scrobble.track.name.lower(),
artist=scrobble.track.artist.name.lower(),
uri=str(spotify_search[0].uri))
scrobble.uri = str(spotify_search[0].uri)
else:
logger.debug('no search tracks returned')
scrobble.uri = None
return scrobbles
def save_scrobbles(scrobbles):
date = str(datetime.datetime.now()).replace(':', '.')
with open(f'{date}_scrobbles.csv', 'w', newline='', encoding='UTF-8') as fileobj:
headers = ['track', 'album', 'artist', 'time', 'track id', 'album id', 'artist id', 'uri']
writer = DictWriter(fileobj, fieldnames=headers, dialect='excel-tab')
writer.writeheader()
for scrobble in scrobbles:
writer.writerow({
'track': scrobble.track.name,
'album': scrobble.track.album.name,
'artist': scrobble.track.artist.name,
'time': scrobble.time,
'track id': scrobble.track.mbid,
'album id': scrobble.track.album.mbid,
'artist id': scrobble.track.artist.mbid,
'uri': str(scrobble.uri) if scrobble.uri is not None else ''
})

23
analysis/query.py Normal file

@@ -0,0 +1,23 @@
from google.cloud import bigquery
client = bigquery.Client()
def all_joined(limit: int = 200):
query = (
'SELECT '
' Scrobbles.track, Scrobbles.album, Scrobbles.artist, Scrobbles.time, Scrobbles.uri, '
' Features.acousticness, Features.danceability, Features.duration_ms, '
' Features.energy, Features.instrumentalness, Features.key, Features.liveness, '
' Features.loudness, Features.mode, Features.speechiness, Features.tempo, '
' Features.time_signature, Features.valence '
'FROM `sarsooxyz.scrobbles.*` AS Scrobbles '
'INNER JOIN `sarsooxyz.audio_features.features` AS Features '
'ON Scrobbles.uri = Features.uri '
)
if limit >= 0:
query += f' LIMIT {limit}'
return client.query(query).to_dataframe()
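# Note: a negative limit omits the LIMIT clause and returns all joined rows.
# Usage sketch: all_joined(limit=-1).groupby('artist')['energy'].mean() gives per-artist mean energy.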


@@ -1,93 +0,0 @@
from fmframework.net.network import Network as FmNet
from spotframework.net.network import Network as SpotNet
from spotframework.net.user import NetworkUser
from spotframework.model.uri import Uri
from google.cloud import bigquery
from csv import DictWriter
import datetime
import os
from log import logger
import analysis.cache
spotnet = SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
client_secret=os.environ['SPOT_SECRET'],
refresh_token=os.environ['SPOT_REFRESH']).refresh_access_token())
fmnet = FmNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])
cache = analysis.cache.load_cache_from_storage()
client = bigquery.Client()
# Perform a query.
QUERY = (
'SELECT '
' DISTINCT uri, track, album, artist '
'FROM `sarsooxyz.scrobbles.*` '
'WHERE '
' uri IS NOT NULL '
'ORDER BY artist '
)
logger.info('querying uris')
query_job = client.query(QUERY)
rows = query_job.result()
features = []
for_pulling = []
logger.info('polling cache')
for row in rows:
cache_entry = cache.get_track(row.track, row.artist)
if cache_entry is not None:
if cache_entry.get('features') is None:
features.append(cache_entry)
continue
for_pulling.append(row)
logger.info('pulling tracks')
tracks = spotnet.get_tracks(uri_strings=[i.uri for i in for_pulling])
if tracks is not None:
logger.info('populating features')
tracks = spotnet.populate_track_audio_features(tracks)
features += [i.audio_features.to_dict() for i in tracks if i.audio_features is not None]
logger.info('caching pulled')
for cacheable in for_pulling:
track = next((i for i in tracks if str(i.uri) == cacheable.uri), None)
if track is not None and track.audio_features is not None:
cache.set_track(name=cacheable.track, artist=cacheable.artist, audio_features=track.audio_features.to_dict())
logger.info('dumping')
date = str(datetime.date.today())
with open(f'{date}_features.csv', 'w', newline='') as fileobj:
headers = ['acousticness',
'analysis_url',
'danceability',
'duration_ms',
'energy',
'uri',
'instrumentalness',
'key',
'key_code',
'liveness',
'loudness',
'mode',
'speechiness',
'tempo',
'time_signature',
'track_href',
'valence']
writer = DictWriter(fileobj, fieldnames=headers, dialect='excel-tab')
writer.writeheader()
for feature in features:
writer.writerow(feature)
analysis.cache.write_cache_to_storage(cache)


@@ -1,32 +0,0 @@
import os, json, pprint
uri_cache_name = 'cache.json'
if os.path.isfile(uri_cache_name):
with open(uri_cache_name, 'r') as uri_cache:
uris = json.loads(uri_cache.read())
new_cache = {
'cache': {}
}
for uri in uris:
try:
new_cache['cache'][uri['artist']]
except KeyError:
new_cache['cache'][uri['artist']] = {}
try:
new_cache['cache'][uri['artist']][uri['name']]
except KeyError:
new_cache['cache'][uri['artist']][uri['name']] = {}
new_cache['cache'][uri['artist']][uri['name']]['uri'] = uri['uri']
pprint.pprint(new_cache)
with open(uri_cache_name, 'w') as uri_cache:
uri_cache.write(json.dumps(new_cache))

2123
poetry.lock generated Normal file

File diff suppressed because it is too large

27
prep-audio-features.py Normal file

@@ -0,0 +1,27 @@
import datetime
import os
from dotenv import load_dotenv
from fmframework.net.network import Network as FMNet
from spotframework.net.network import Network as SpotNet, NetworkUser
from spotframework.model.uri import Uri
from analysis.prep.feature import prepare_features
from analysis.cache import load_cache_from_storage, write_cache_to_storage
from analysis import init_log
load_dotenv()
init_log()
spotnet = SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
client_secret=os.environ['SPOT_SECRET'],
refresh_token=os.environ['SPOT_REFRESH'])).refresh_access_token()
fmnet = FMNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])
cache = load_cache_from_storage()
try:
prepare_features(spotnet, fmnet, cache)
except Exception as e:
print(f"Error Occured: {e}")
finally:
write_cache_to_storage(cache)

33
prep-scrobbles.py Normal file

@@ -0,0 +1,33 @@
import datetime
import os
from dotenv import load_dotenv
from fmframework.net.network import Network as FMNet
from spotframework.net.network import Network as SpotNet, NetworkUser
from spotframework.model.uri import Uri
from analysis.prep.scrobble import prepare_scrobbles, populated_scrobbles
from analysis.cache import load_cache_from_storage, write_cache_to_storage
from analysis import init_log
load_dotenv()
init_log()
spotnet = SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
client_secret=os.environ['SPOT_SECRET'],
refresh_token=os.environ['SPOT_REFRESH'])).refresh_access_token()
fmnet = FMNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])
cache = load_cache_from_storage()
try:
for year in range(2017, 2021):
from_date = datetime.datetime(year=year, month=1, day=1)
to_date = datetime.datetime(year=year + 1, month=1, day=1)
print(f"Getting {year}")
prepare_scrobbles(spotnet, fmnet, cache, from_date, to_date)
except Exception as e:
print(f"Error Occured: {e}")
finally:
write_cache_to_storage(cache)

29
pyproject.toml Normal file

@@ -0,0 +1,29 @@
[tool.poetry]
name = "listening-analysis"
version = "0.1.0"
description = "Analysing listening habits using Spotify & Last.fm data"
authors = ["andy <andy@sarsoo.xyz>"]
[tool.poetry.dependencies]
python = ">=3.8,<3.10"
spotframework = {path = "../spotframework"}
fmframework = {path = "../fmframework"}
numpy = "^1.20.0"
pandas = "^1.2.1"
opencv-python = "^4.5.1"
ipykernel = "^5.4.3"
jupyterlab = {version = "^3.0.6", optional = true}
google-cloud-bigquery = "^2.7.0"
python-dotenv = "^0.15.0"
matplotlib = "^3.3.4"
pyarrow = "^3.0.0"
[tool.poetry.dev-dependencies]
pylint = "^2.6.0"
[tool.poetry.extras]
jupyter = ["jupyterlab"]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
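With this in place, the environment can be installed via Poetry; the JupyterLab extra is optional:

poetry install -E jupyter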

requirements.txt

@@ -1,22 +0,0 @@
cachetools==4.0.0
certifi==2019.11.28
chardet==3.0.4
Click==7.0
google-api-core==1.16.0
google-auth==1.11.0
google-cloud-bigquery==1.23.1
google-cloud-core==1.2.0
google-resumable-media==0.5.0
googleapis-common-protos==1.51.0
idna==2.8
numpy==1.18.1
opencv-python==4.1.2.30
protobuf==3.11.2
pyasn1==0.4.8
pyasn1-modules==0.2.8
pytz==2019.3
requests==2.22.0
rsa==4.0
six==1.14.0
tabulate==0.8.6
urllib3==1.25.7


@@ -1,70 +0,0 @@
from fmframework.net.network import Network as FmNet
from spotframework.net.network import Network as SpotNet
from spotframework.net.user import NetworkUser
from spotframework.model.uri import Uri
from csv import DictWriter
import os
import datetime
from log import logger
import analysis.cache
spotnet = SpotNet(NetworkUser(client_id=os.environ['SPOT_CLIENT'],
client_secret=os.environ['SPOT_SECRET'],
refresh_token=os.environ['SPOT_REFRESH']).refresh_access_token())
fmnet = FmNet(username='sarsoo', api_key=os.environ['FM_CLIENT'])
# initialise cache
cache = analysis.cache.load_cache_from_storage()
# scrobble range
from_date = datetime.datetime(year=2019, month=1, day=1)
to_date = datetime.datetime(year=2020, month=1, day=1)
scrobbles = fmnet.get_recent_tracks(from_time=from_date, to_time=to_date, page_limit=200)
# populate with uris
for scrobble in scrobbles:
cache_entry = cache.get_track(name=scrobble.track.name.lower(), artist=scrobble.track.artist.name.lower())
if cache_entry is not None and cache_entry.get('uri'):
scrobble.uri = cache_entry.get('uri')
else:
logger.info(f'pulling {scrobble.track}')
spotify_search = spotnet.search(query_types=[Uri.ObjectType.track],
track=scrobble.track.name,
artist=scrobble.track.artist.name,
response_limit=5).tracks
if len(spotify_search) > 0:
cache.set_track(name=scrobble.track.name.lower(),
artist=scrobble.track.artist.name.lower(),
uri=str(spotify_search[0].uri))
scrobble.uri = str(spotify_search[0].uri)
else:
logger.debug('no search tracks returned')
scrobble.uri = None
date = str(datetime.date.today())
with open(f'{date}_scrobbles.csv', 'w', newline='') as fileobj:
headers = ['track', 'album', 'artist', 'time', 'track id', 'album id', 'artist id', 'uri']
writer = DictWriter(fileobj, fieldnames=headers, dialect='excel-tab')
writer.writeheader()
for scrobble in scrobbles:
writer.writerow({
'track': scrobble.track.name,
'album': scrobble.track.album.name,
'artist': scrobble.track.artist.name,
'time': scrobble.time,
'track id': scrobble.track.mbid,
'album id': scrobble.track.album.mbid,
'artist id': scrobble.track.artist.mbid,
'uri': str(scrobble.uri) if scrobble.uri is not None else ''
})
analysis.cache.write_cache_to_storage(cache)

103
stats.ipynb Normal file

File diff suppressed because one or more lines are too long
