kopia lustrzana https://dev.funkwhale.audio/funkwhale/funkwhale
92 wiersze
2.9 KiB
Python
92 wiersze
2.9 KiB
Python
import logging
|
|
import re
|
|
|
|
import unidecode
|
|
from django.conf import settings
|
|
from django.core.cache import cache
|
|
from lb_matching_tools.cleaner import MetadataCleaner
|
|
|
|
from funkwhale_api.music import models as music_models
|
|
|
|
# Logger named after this module.
logger = logging.getLogger(__name__)

# Typesense connection parameters, read from the Django settings once at
# import time and reused when building the client.
api_key = settings.TYPESENSE_API_KEY
host = settings.TYPESENSE_HOST
port = settings.TYPESENSE_PORT
protocol = settings.TYPESENSE_PROTOCOL

# Sent as the "num_typos" search parameter when resolving recordings.
TYPESENSE_NUM_TYPO = settings.TYPESENSE_NUM_TYPO
|
|
|
|
|
|
class TypesenseNotActivate(Exception):
    """Raised when Typesense is used while ``settings.TYPESENSE_API_KEY`` is unset."""
|
|
|
|
|
|
# The typesense client library is only imported when an API key is configured,
# so the module can still be loaded on installations that do not use Typesense.
if settings.TYPESENSE_API_KEY:
    import typesense
else:
    logger.info(
        "Typesense is not activated. You can enable it by setting the TYPESENSE_API_KEY env variable."
    )
|
|
|
|
|
|
def delete_non_alnum_characters(text):
    """
    Return a lowercased, transliterated form of ``text`` without non-word characters.

    Steps, in order: every run of characters that is not a regex "word"
    character is removed (so letters, digits and underscores are kept), the
    remainder is lowercased, and the result is passed through ``unidecode``.
    """
    # NOTE(review): transliteration runs last, so anything unidecode emits
    # (presumably including spaces or capitals for some scripts) is returned
    # as-is -- confirm this is intended before relying on a strictly
    # alphanumeric result.
    word_chars_only = re.sub(r"[^\w]+", "", text)
    lowered = word_chars_only.lower()
    return unidecode.unidecode(lowered)
|
|
|
|
|
|
def resolve_recordings_to_fw_track(recordings):
    """
    Tries to match troi recording entities to fw tracks using the typesense index.

    For every recording, a canonical name is built from the cleaned artist and
    recording names and searched in the ``canonical_fw_data`` collection. The
    primary key of the first hit is stored in the cache under the recording's
    mbid. For test purposes: when ``settings.DEBUG`` is enabled and several fw
    tracks are returned, the extra hits are logged, but only the best result is
    kept to avoid duplicates.

    :param recordings: iterable of recording objects exposing ``name``,
        ``mbid`` and ``artist.name``. It is consumed in a single pass, so a
        one-shot iterable (e.g. a generator) is supported.
    :returns: the result of ``cache.get_many`` for the mbids of all given
        recordings (matched or not).
    :raises TypesenseNotActivate: if ``settings.TYPESENSE_API_KEY`` is not set.
    """
    if not settings.TYPESENSE_API_KEY:
        raise TypesenseNotActivate(
            "Typesense is not activated. You can enable it by setting the TYPESENSE_API_KEY env variable."
        )

    client = typesense.Client(
        {
            "api_key": api_key,
            "nodes": [{"host": host, "port": port, "protocol": protocol}],
            "connection_timeout_seconds": 2,
        }
    )

    mc = MetadataCleaner()
    # Collected while iterating so that ``recordings`` is only traversed once.
    mbids = []

    for recording in recordings:
        mbids.append(recording.mbid)

        rec = mc.clean_recording(recording.name)
        artist = mc.clean_artist(recording.artist.name)
        canonical_name_for_track = delete_non_alnum_characters(artist + rec)

        logger.debug(f"Trying to resolve : {canonical_name_for_track}")

        search_parameters = {
            "q": canonical_name_for_track,
            "query_by": "combined",
            "num_typos": TYPESENSE_NUM_TYPO,
            "drop_tokens_threshold": 0,
        }
        matches = client.collections["canonical_fw_data"].documents.search(
            search_parameters
        )

        hits = matches["hits"]
        if not hits:
            logger.debug("No match found in fw db")
            continue

        # The first hit is treated as the best match and is the only one cached.
        pk = hits[0]["document"]["pk"]
        logger.debug(f"Saving match for track with primary key {pk}")
        cache.set(recording.mbid, pk)

        if settings.DEBUG:
            # Slicing yields an empty list when there is a single hit, whereas
            # indexing ``hits[1]`` would raise IndexError in that case.
            for duplicate in hits[1:]:
                duplicate_pk = duplicate["document"]["pk"]
                fw_track = music_models.Track.objects.get(pk=duplicate_pk)
                # Adjacent literals keep source indentation out of the message.
                logger.info(
                    f"Duplicate match found for {fw_track.artist.name} {fw_track.title} "
                    f"and primary key {duplicate_pk}. Skipping because of better match."
                )

    return cache.get_many(mbids)
|