listening-analysis/analysis/query.py

37 lines
1.3 KiB
Python

from google.cloud import bigquery
import pandas as pd
# Shared BigQuery client: created once, when this module is imported, and used
# by all_joined() to submit its query. It is constructed with no arguments.
# NOTE(review): project and credentials presumably come from the environment's
# defaults -- confirm against the deployment setup.
client = bigquery.Client()
def all_joined(limit: int = 200):
    """Query scrobbles joined with their audio features.

    Selects the distinct scrobble fields (track, album, artist, time, uri)
    together with the audio-feature columns, reading from the
    ``sarsooxyz.scrobbles.*`` tables left-joined to
    ``sarsooxyz.audio_features.features`` on ``uri``.

    Args:
        limit: Maximum number of rows to request. A negative value omits the
            ``LIMIT`` clause entirely, so no row cap is applied.

    Returns:
        The result of running the query through the module-level ``client``
        and converting it with ``to_dataframe()``.
    """
    # The literal fragments are concatenated verbatim; each one carries its
    # own leading/trailing space so the pieces stay separated once joined.
    select_clause = (
        'SELECT DISTINCT'
        ' Scrobbles.track, Scrobbles.album, Scrobbles.artist, Scrobbles.time, Scrobbles.uri, '
        ' Features.acousticness, Features.danceability, Features.duration_ms, '
        ' Features.energy, Features.instrumentalness, Features.key, Features.liveness, '
        ' Features.loudness, Features.mode, Features.speechiness, Features.tempo, '
        ' Features.time_signature, Features.valence '
    )
    source_clause = (
        'FROM `sarsooxyz.scrobbles.*` AS Scrobbles '
        'LEFT JOIN `sarsooxyz.audio_features.features` AS Features '
        'ON Scrobbles.uri = Features.uri '
    )
    # Only a non-negative limit produces a LIMIT clause.
    limit_clause = f' LIMIT {limit}' if limit >= 0 else ''

    sql = select_clause + source_clause + limit_clause
    job = client.query(sql)
    return job.to_dataframe()
def get_query(pull=False, cache="query.csv"):
    """Return the joined scrobble table indexed by its ``time`` column.

    Args:
        pull: When true, skip the cache file and always fetch the full
            dataset via ``all_joined(limit=-1)``.
        cache: Path of a tab-separated file whose first column is read as the
            index. It is only consulted when ``pull`` is false. If the file
            does not exist, a notice is printed and the full dataset is
            fetched instead.

    Returns:
        The table with its ``time`` column parsed by ``pd.to_datetime`` and
        then moved into the index.

    Note:
        This function only reads ``cache``; it never writes it.
    """
    if not pull:
        try:
            table = pd.read_csv(cache, index_col=0, sep='\t')
        except FileNotFoundError:
            # No cached copy on disk: say so, then fall back to a full pull.
            print(f'{cache} not found, pulling')
            table = all_joined(limit=-1)
    else:
        # limit=-1 asks all_joined for the whole dataset (no LIMIT clause).
        table = all_joined(limit=-1)

    timestamps = pd.to_datetime(table['time'])
    table['time'] = timestamps
    return table.set_index('time')