2021-02-01 01:37:22 +00:00
|
|
|
|
|
|
|
from google.cloud import bigquery
|
2021-02-01 21:43:27 +00:00
|
|
|
import pandas as pd
|
2021-02-01 01:37:22 +00:00
|
|
|
|
|
|
|
# Module-level BigQuery client, constructed once when this module is imported
# and used by all_joined() to run its query.
# NOTE(review): presumably resolves project and credentials from the
# environment since no arguments are passed — confirm.
client = bigquery.Client()
|
|
|
|
|
|
|
|
def all_joined(limit: "int | None" = 200) -> "pd.DataFrame":
    """Return scrobbles left-joined with their audio features as a DataFrame.

    Runs a ``SELECT DISTINCT`` over every table matching
    ``sarsooxyz.scrobbles.*``, left-joined on ``uri`` against
    ``sarsooxyz.audio_features.features``, using the module-level BigQuery
    ``client``.

    Args:
        limit: Maximum number of rows to request. ``None`` or any negative
            value omits the ``LIMIT`` clause so the full result is returned.

    Returns:
        The query result converted with ``to_dataframe()``.
    """
    query = (
        'SELECT DISTINCT'
        ' Scrobbles.track, Scrobbles.album, Scrobbles.artist, Scrobbles.time, Scrobbles.uri, '
        ' Features.acousticness, Features.danceability, Features.duration_ms, '
        ' Features.energy, Features.instrumentalness, Features.key, Features.liveness, '
        ' Features.loudness, Features.mode, Features.speechiness, Features.tempo, '
        ' Features.time_signature, Features.valence '
        'FROM `sarsooxyz.scrobbles.*` AS Scrobbles '
        'LEFT JOIN `sarsooxyz.audio_features.features` AS Features '
        'ON Scrobbles.uri = Features.uri '
    )

    # None means "no limit" (previously a TypeError on the comparison).
    if limit is not None and limit >= 0:
        # int() keeps a non-integral value such as 5.0 from being rendered
        # verbatim into the SQL text as "LIMIT 5.0".
        query += f' LIMIT {int(limit)}'

    return client.query(query).to_dataframe()
|
|
|
|
|
|
|
|
def get_query(pull=False, cache="query.csv"):
    """Load the joined scrobble dataset, indexed by scrobble time.

    Args:
        pull: When true, skip the cache file and query BigQuery through
            ``all_joined``.
        cache: Path of a tab-separated cache file whose first column is the
            index. Only read when ``pull`` is false; if the file does not
            exist a message is printed and the data is pulled instead.

    Returns:
        A DataFrame whose ``time`` column has been parsed with
        ``pd.to_datetime`` and set as the index.
    """
    scrobbles = None

    if not pull:
        try:
            scrobbles = pd.read_csv(cache, sep='\t', index_col=0)
        except FileNotFoundError:
            print(f'{cache} not found, pulling')

    if scrobbles is None:
        # A negative limit makes all_joined omit the LIMIT clause.
        scrobbles = all_joined(limit=-1)

    # A cache whose first column is already ``time`` (e.g. one saved from a
    # time-indexed frame) has that column consumed by ``index_col=0``;
    # restore it so the lookup below does not raise KeyError.
    if 'time' not in scrobbles.columns and scrobbles.index.name == 'time':
        scrobbles = scrobbles.reset_index()

    scrobbles['time'] = pd.to_datetime(scrobbles['time'])
    return scrobbles.set_index('time')
|