deduplicate script and scrobble while listening catching

This commit is contained in:
aj 2020-07-20 14:59:39 +01:00
parent c02fcb117f
commit 478f2eaa4d
4 changed files with 142 additions and 9 deletions

2
.gitignore vendored
View File

@ -1,6 +1,8 @@
venv
__pycache__
*.csv
*.build/
*.dist/
.idea
.fm
scratch.py

114
duplicate.py Normal file
View File

@ -0,0 +1,114 @@
from fmframework.net.network import Network, LastFMNetworkException
from csv import DictWriter
import os
import logging
# Last.fm account whose scrobbles are inspected
username = 'sarsoo'

# the script logs through the library's logger so library messages are captured too
logger = logging.getLogger('fmframework')

# folder holding the script's log file
directory = '.fm'
# exist_ok avoids the check-then-create race of testing os.path.exists first
os.makedirs(directory, exist_ok=True)

# full detail (with timestamps) goes to the log file
file_handler = logging.FileHandler(f"{directory}/deduplicate.log")
file_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(name)s - %(funcName)s - %(message)s'))
logger.addHandler(file_handler)

# a shorter format is echoed to the console
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter('%(levelname)s %(name)s:%(funcName)s - %(message)s'))
logger.addHandler(stream_handler)
def neighbouring_scrobbles(scrobbles, sample_size):
    """Yield a sliding window of up to ``sample_size`` scrobbles per start index.

    One window is produced for every index of ``scrobbles``; the window begins
    at that index and holds the following neighbours. Windows near the end of
    the sequence are shorter, down to a single element.

    :param scrobbles: sliceable sequence of scrobbles
    :param sample_size: maximum number of items in each window
    :return: generator of slices of ``scrobbles``
    """
    total = len(scrobbles)
    if total < sample_size:
        logger.warning(f'less scrobbles than provided sample size {total}/{sample_size}')

    for start_idx in range(total):
        # slice ends are exclusive and clamp to the sequence length, so no manual
        # capping is needed; capping at len - 1 dropped the final scrobble from
        # every window and made the last window empty
        yield scrobbles[start_idx:start_idx + sample_size]
def _track_url(track):
    """Build the Last.fm library URL for ``track`` in ``username``'s library."""
    # spaces become "+"; every other character is passed through unchanged
    artist_part = track.artist.name.replace(" ", "+")
    track_part = track.name.replace(" ", "+")
    return f'https://www.last.fm/user/{username}/library/music/{artist_part}/_/{track_part}'


def _scrobbles_url(initial, duplicate):
    """Build the Last.fm library URL covering the days from ``initial`` to ``duplicate``."""
    return (f'https://www.last.fm/user/{username}/library'
            f'?from={initial.time.strftime("%Y-%m-%d")}'
            f'&to={duplicate.time.strftime("%Y-%m-%d")}')


def _find_duplicates(scrobbles, sample_size):
    """Return ``(duplicate, initial, distance)`` for each repeated track within a window."""
    found = []
    for group in neighbouring_scrobbles(scrobbles, sample_size):
        if not group:
            # nothing to compare in an empty window
            continue
        head = group[0]
        for distance, candidate in enumerate(group[1:], start=1):
            if head.track == candidate.track:
                found.append((head, candidate, distance))
    return found


def check_for_duplicates(fmkey, retrieval_limit, sample_size=7):
    """Report tracks scrobbled more than once within a few neighbouring scrobbles.

    Recent scrobbles are fetched for ``username``; each scrobble is compared with
    the ones that follow it in the returned list. Every match is printed together
    with library links and written as a row of ``duplicates.csv``.

    The first scrobble of each window is reported as the 'duplicate' and the
    matching one further down the list as the 'initial'.
    NOTE(review): this assumes the API returns the newest scrobble first - confirm.

    :param fmkey: Last.fm API key
    :param retrieval_limit: maximum number of scrobbles to fetch, None for no limit
    :param sample_size: window size used when looking for neighbouring repeats
    :return: None; a network failure is logged rather than raised
    """
    net = Network(username=username, api_key=fmkey)
    net.retry_counter = 20

    # only the retrieval can raise the network exception, so only it is guarded
    try:
        scrobbles = net.get_recent_tracks(limit=retrieval_limit, page_limit=200)
    except LastFMNetworkException:
        logger.exception('error during scrobble retrieval')
        return

    if not scrobbles:
        logger.error('No scrobbles returned')
        return

    duplicates_found = _find_duplicates(scrobbles, sample_size)

    print(f'Found {len(duplicates_found)} duplicates')
    print()

    for duplicate, initial, _ in duplicates_found:
        print(f'{initial.time} - {duplicate.time}, {duplicate.track}')
        print(_track_url(duplicate.track))
        # from = initial, to = duplicate, the same order as the CSV column;
        # the printed link previously had the two dates swapped
        print(_scrobbles_url(initial, duplicate))
        print()

    headers = ['initial', 'duplicate', 'scrobble difference', 'difference minutes', 'track',
               'album', 'artist', 'track url', 'scrobbles url']
    with open('duplicates.csv', 'w', newline='', encoding='utf-16') as fileobj:
        writer = DictWriter(fileobj, fieldnames=headers)
        writer.writeheader()

        for duplicate, initial, distance in duplicates_found:
            track = duplicate.track
            writer.writerow({
                'initial': initial.time,
                'duplicate': duplicate.time,
                'scrobble difference': distance,
                'difference minutes': (duplicate.time - initial.time).total_seconds() / 60,
                'track': track.name,
                # a track may carry no album; leave the cell empty instead of crashing
                'album': track.album.name if track.album is not None else '',
                'artist': track.artist.name,
                'track url': _track_url(track),
                'scrobbles url': _scrobbles_url(initial, duplicate),
            })
if __name__ == '__main__':
    # take the API key from the environment, otherwise ask for it
    key = os.environ.get('FMKEY')
    if key is None:
        key = input('enter Last.fm key: ')

    limit = input('limit? (0 for none): ')
    if not limit.isdigit():
        print('not a number, setting to none')
        limit = None
    else:
        # an answer of 0 means "no limit"
        limit = int(limit) or None

    check_for_duplicates(key, limit)

    # keep the console window open until the user acknowledges
    input('done, hit key to quit...')

View File

@ -47,21 +47,29 @@ class LastFM:
return self.name
@dataclass
@dataclass(eq=False)
class Artist(LastFM):
    """Artist entity; equality is decided by class and ``name`` only."""

    def __str__(self):
        return '{}'.format(self.name)

    def __eq__(self, other):
        # objects of a different class are never equal
        if self.__class__ != other.__class__:
            return False
        return self.name == other.name
@dataclass
@dataclass(eq=False)
class Album(LastFM):
    """Album entity; equality is decided by class, ``name`` and ``artist``."""

    # artist credited for the album, if known
    artist: Artist = None

    def __str__(self):
        return '{} / {}'.format(self.name, self.artist)

    def __eq__(self, other):
        # objects of a different class are never equal
        if self.__class__ != other.__class__:
            return False
        mine = (self.name, self.artist)
        theirs = (other.name, other.artist)
        return mine == theirs
@dataclass
@dataclass(eq=False)
class Track(LastFM):
album: Album = None
artist: Artist = None
@ -69,6 +77,11 @@ class Track(LastFM):
def __str__(self):
    """Render the track as ``name / album / artist``."""
    return '{} / {} / {}'.format(self.name, self.album, self.artist)
def __eq__(self, other):
    """Return True when ``other`` has the same class, name, album and artist."""
    if self.__class__ != other.__class__:
        return False
    # the album must be taken from ``other``; comparing self.album with itself
    # made tracks on different albums compare equal
    return (self.name, self.album, self.artist) == (other.name, other.album, other.artist)
class WeeklyChart:
def __init__(self, from_time, to_time):
@ -94,3 +107,8 @@ class Scrobble:
def __str__(self):
    """Render the scrobble as the string form of its track."""
    # __str__ must return a str; returning the track object itself makes
    # str(scrobble) raise TypeError
    return str(self.track)
def __eq__(self, other):
    """Return True when ``other`` has the same class, track and time."""
    if self.__class__ != other.__class__:
        return False
    # the time must be taken from ``other``; comparing self.time with itself
    # made every scrobble of the same track compare equal regardless of time
    return (self.track, self.time) == (other.track, other.time)

View File

@ -3,6 +3,7 @@ from dataclasses import dataclass
from typing import Optional, List
from copy import deepcopy
import logging
from time import sleep
from enum import Enum
from datetime import datetime, date, time, timedelta
@ -59,6 +60,7 @@ class Network:
if 200 <= response.status_code < 300:
logger.debug(f'{http_method} {method} {response.status_code}')
self.retry_counter = 0
return resp
code = resp.get('error', None)
@ -68,6 +70,7 @@ class Network:
if code in [8, 11, 16]:
if self.retry_counter < 5:
self.retry_counter += 1
sleep(2)
logger.warning(f'{method} {response.status_code} {code} {message} retyring')
return self.net_call(http_method=http_method,
method=method,
@ -134,11 +137,7 @@ class Network:
items = iterator.items
if len(items) >= 1:
if items[0].get('@attr', {}).get('nowplaying', None):
items.pop(0)
return [self.parse_scrobble(i) for i in items[:limit]]
return [self.parse_scrobble(i) for i in items[:limit] if i.get('date')]
def get_scrobbles_from_date(self,
input_date: date,
@ -423,7 +422,7 @@ class PageCollection:
self.method = method
self.params = params
self.pages: List[Page] = []
self.page_limit = page_limit
self.page_limit = min(page_limit, 200)
self.response_limit = response_limit
self.counter = 0