Files
NaviMigrate/internal/match/matcher.go

359 lines
7.8 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package match
import (
"context"
"fmt"
"math"
"regexp"
"sort"
"strings"
"sync"
"navimigrate/internal/model"
"navimigrate/internal/navidrome"
)
// Searcher is the track-search backend used by Matcher.
type Searcher interface {
	// SearchTracks returns tracks matching the free-text query. The matcher
	// passes limit as the maximum number of results it wants back.
	SearchTracks(ctx context.Context, query string, limit int) ([]navidrome.Track, error)
}
// Matcher finds the best counterpart for a source track by running several
// search queries through a Searcher and scoring the returned candidates.
type Matcher struct {
	// searcher performs the actual track lookups.
	searcher Searcher
	// threshold is the minimum score a candidate needs to count as a match.
	threshold float64
	// cacheMu guards cache.
	cacheMu sync.RWMutex
	// cache maps a trimmed query string to the results of its successful search.
	cache map[string][]navidrome.Track
}
// NewMatcher builds a Matcher around searcher with an empty query cache.
// A negative threshold is replaced by the default of 45; zero and positive
// values are used as given.
func NewMatcher(searcher Searcher, threshold float64) *Matcher {
	m := &Matcher{
		searcher:  searcher,
		threshold: threshold,
		cache:     make(map[string][]navidrome.Track),
	}
	if m.threshold < 0 {
		m.threshold = 45
	}
	return m
}
// MatchTrack looks for the best counterpart of src and reports whether it
// clears the matcher's threshold.
//
// Each query produced by buildQueries is run through searchCached. Every
// distinct candidate (keyed by ID) is scored once with scoreCandidate and the
// highest score wins; on a tie the candidate seen first is kept.
//
// The result always carries Source. Possible outcomes:
//   - no queries could be built: Matched is false, Reason "no usable metadata";
//   - no candidate was found and at least one search failed: Matched is false
//     and Reason describes the last search error;
//   - no candidate was found and no search failed: Reason "no candidates";
//   - best score >= threshold: Matched is true with TargetID, Score and Query;
//   - otherwise: Matched is false, but TargetID, Score and Query still describe
//     the best candidate and Reason states the score shortfall.
func (m *Matcher) MatchTrack(ctx context.Context, src model.Track) model.MatchedTrack {
	queries := m.buildQueries(src)
	if len(queries) == 0 {
		return model.MatchedTrack{Source: src, Matched: false, Reason: "no usable metadata"}
	}

	type scored struct {
		track navidrome.Track
		score float64
		query string
	}
	// Start below any reachable score so the first candidate always wins.
	best := scored{score: -999}
	seen := map[string]struct{}{}
	// lastErr remembers the most recent search failure. Searching stays
	// best-effort (one failing query must not abort the others), but the
	// failure is surfaced if it turns out nothing could be found at all.
	var lastErr error

	for _, q := range queries {
		candidates, err := m.searchCached(ctx, q)
		if err != nil {
			lastErr = err
			continue
		}
		for _, c := range candidates {
			// The same track may come back for several queries; score it once.
			if _, ok := seen[c.ID]; ok {
				continue
			}
			seen[c.ID] = struct{}{}
			score := scoreCandidate(src, c)
			if score > best.score {
				best = scored{track: c, score: score, query: q}
			}
		}
	}

	if best.track.ID == "" {
		if lastErr != nil {
			// Distinguish "the backend failed" from "the library has no such track".
			return model.MatchedTrack{
				Source:  src,
				Matched: false,
				Reason:  fmt.Sprintf("search failed: %v", lastErr),
			}
		}
		return model.MatchedTrack{Source: src, Matched: false, Reason: "no candidates"}
	}

	if best.score >= m.threshold {
		return model.MatchedTrack{
			Source:   src,
			TargetID: best.track.ID,
			Score:    best.score,
			Query:    best.query,
			Matched:  true,
		}
	}

	reason := fmt.Sprintf("best score %.1f below threshold %.1f", best.score, m.threshold)
	return model.MatchedTrack{
		Source:   src,
		TargetID: best.track.ID,
		Score:    best.score,
		Query:    best.query,
		Matched:  false,
		Reason:   reason,
	}
}
// searchCached returns the search results for q, consulting the in-memory
// cache first. The query is trimmed before use; a blank query yields
// (nil, nil) without touching the searcher. Only successful searches are
// stored, so a failed query is retried on the next call.
//
// The lock is not held while the searcher runs, so concurrent misses for the
// same query may each call the searcher; the last result stored wins.
func (m *Matcher) searchCached(ctx context.Context, q string) ([]navidrome.Track, error) {
	key := strings.TrimSpace(q)
	if key == "" {
		return nil, nil
	}

	m.cacheMu.RLock()
	hit, found := m.cache[key]
	m.cacheMu.RUnlock()
	if found {
		return hit, nil
	}

	// Ask for at most 20 tracks per query.
	tracks, err := m.searcher.SearchTracks(ctx, key, 20)
	if err != nil {
		return nil, err
	}

	m.cacheMu.Lock()
	m.cache[key] = tracks
	m.cacheMu.Unlock()
	return tracks, nil
}
// buildQueries derives the ordered, de-duplicated list of search strings to
// try for src. It returns nil when the track has no title.
//
// Queries are emitted in this order, skipping blanks and repeats:
//  1. the ISRC;
//  2. "title artist" (first artist only);
//  3. the transliterated "title artist";
//  4. if cleanTitle changed the title: "cleaned-title artist" and its
//     transliterated form;
//  5. the bare title;
//  6. the bare transliterated title.
func (m *Matcher) buildQueries(src model.Track) []string {
	title := strings.TrimSpace(src.Title)
	if title == "" {
		return nil
	}

	var artist string
	if len(src.Artists) > 0 {
		artist = src.Artists[0]
	}
	latinTitle := strings.TrimSpace(transliterateToLatin(title))
	latinArtist := strings.TrimSpace(transliterateToLatin(artist))

	// At most seven queries are ever produced (see the list above).
	const maxQueries = 7
	seen := make(map[string]struct{}, maxQueries)
	out := make([]string, 0, maxQueries)
	// add appends q unless it is blank after trimming or already present.
	add := func(q string) {
		q = strings.TrimSpace(q)
		if q == "" {
			return
		}
		if _, dup := seen[q]; dup {
			return
		}
		seen[q] = struct{}{}
		out = append(out, q)
	}

	add(src.ISRC)
	add(title + " " + artist)
	if latinTitle != "" {
		add(latinTitle + " " + latinArtist)
	}
	if stripped := cleanTitle(title); stripped != title {
		add(stripped + " " + artist)
		if latinStripped := strings.TrimSpace(transliterateToLatin(stripped)); latinStripped != "" {
			add(latinStripped + " " + latinArtist)
		}
	}
	add(title)
	add(latinTitle)

	return out
}
// scoreCandidate rates how well dst matches src; higher is better.
//
// Contributions:
//   - +60 when src has an ISRC that appears in dst.ISRCs;
//   - up to +25 for title similarity (on normalized titles);
//   - up to +20 for similarity between src's first artist and dst.Artist;
//   - duration (only when both are positive), by difference in seconds:
//     <=2 +10, <=5 +7, <=10 +4, >25 -6, otherwise no adjustment;
//   - -8 when dst is marked "live" but src is not;
//   - -6 when dst mentions "remix" but src does not;
//   - -12 when dst mentions "karaoke".
func scoreCandidate(src model.Track, dst navidrome.Track) float64 {
	// Normalize both titles once; they feed the similarity and the penalties.
	srcTitle := normalize(src.Title)
	dstTitle := normalize(dst.Title)

	score := 0.0
	if src.ISRC != "" && hasISRC(dst.ISRCs, src.ISRC) {
		score += 60
	}
	score += 25 * similarity(srcTitle, dstTitle)

	primaryArtist := ""
	if len(src.Artists) > 0 {
		primaryArtist = src.Artists[0]
	}
	if primaryArtist != "" {
		score += 20 * similarity(normalize(primaryArtist), normalize(dst.Artist))
	}

	if src.DurationMS > 0 && dst.Duration > 0 {
		delta := math.Abs(float64(src.DurationMS/1000 - dst.Duration))
		switch {
		case delta <= 2:
			score += 10
		case delta <= 5:
			score += 7
		case delta <= 10:
			score += 4
		case delta > 25:
			score -= 6
		}
	}

	// "live" is matched as a whole word: a substring test would treat
	// "alive", "olive" or "deliver" as a live marker, both hiding the penalty
	// for e.g. "Alive" vs "Alive (Live)" and applying it to unrelated titles.
	if !containsWord(srcTitle, "live") && containsWord(dstTitle, "live") {
		score -= 8
	}
	// "remix" and "karaoke" stay substring matches so that variants such as
	// "remixed" are still caught.
	if !strings.Contains(srcTitle, "remix") && strings.Contains(dstTitle, "remix") {
		score -= 6
	}
	if strings.Contains(dstTitle, "karaoke") {
		score -= 12
	}
	return score
}

// containsWord reports whether word occurs as a whole whitespace-separated
// token of normalized.
func containsWord(normalized, word string) bool {
	for _, tok := range strings.Fields(normalized) {
		if tok == word {
			return true
		}
	}
	return false
}
// hasISRC reports whether wanted occurs among candidates. Surrounding
// whitespace is ignored on both sides and the comparison is case-insensitive.
// A blank wanted value never matches.
func hasISRC(candidates []string, wanted string) bool {
	target := strings.ToUpper(strings.TrimSpace(wanted))
	if target == "" {
		return false
	}
	for i := range candidates {
		if strings.EqualFold(strings.TrimSpace(candidates[i]), target) {
			return true
		}
	}
	return false
}
// nonAlphaNum matches runs of characters that are neither letters nor digits
// in any script. An ASCII-only class would erase every letter that
// transliterateToLatin does not convert (CJK, Greek, accented Latin, ...),
// collapsing such titles to the empty string so they could never be compared.
var nonAlphaNum = regexp.MustCompile(`[^\p{L}\p{N}]+`)

// normalize reduces s to a canonical form for comparison: it is
// transliterated with transliterateToLatin, lower-cased, "&" becomes the word
// "and", every run of non-letter/non-digit characters becomes a single space,
// and leading/trailing whitespace is removed.
func normalize(s string) string {
	s = transliterateToLatin(s)
	s = strings.ToLower(strings.TrimSpace(s))
	// Expand "&" before punctuation is stripped so "A & B" equals "A and B".
	s = strings.ReplaceAll(s, "&", " and ")
	s = nonAlphaNum.ReplaceAllString(s, " ")
	// Fields+Join trims the ends and collapses any remaining repeated spaces.
	return strings.Join(strings.Fields(s), " ")
}
// cyrillicToLatin maps Cyrillic letters to their Latin replacement. Both
// lower- and upper-case letters are listed and both map to lower-case output;
// the hard and soft signs map to the empty string.
var cyrillicToLatin = map[rune]string{
	'а': "a", 'б': "b", 'в': "v", 'г': "g", 'д': "d", 'е': "e", 'ё': "e", 'ж': "zh", 'з': "z", 'и': "i", 'й': "i",
	'к': "k", 'л': "l", 'м': "m", 'н': "n", 'о': "o", 'п': "p", 'р': "r", 'с': "s", 'т': "t", 'у': "u", 'ф': "f",
	'х': "h", 'ц': "ts", 'ч': "ch", 'ш': "sh", 'щ': "shch", 'ъ': "", 'ы': "y", 'ь': "", 'э': "e", 'ю': "yu", 'я': "ya",
	'і': "i", 'ї': "yi", 'є': "ye", 'ґ': "g",
	'А': "a", 'Б': "b", 'В': "v", 'Г': "g", 'Д': "d", 'Е': "e", 'Ё': "e", 'Ж': "zh", 'З': "z", 'И': "i", 'Й': "i",
	'К': "k", 'Л': "l", 'М': "m", 'Н': "n", 'О': "o", 'П': "p", 'Р': "r", 'С': "s", 'Т': "t", 'У': "u", 'Ф': "f",
	'Х': "h", 'Ц': "ts", 'Ч': "ch", 'Ш': "sh", 'Щ': "shch", 'Ъ': "", 'Ы': "y", 'Ь': "", 'Э': "e", 'Ю': "yu", 'Я': "ya",
	'І': "i", 'Ї': "yi", 'Є': "ye", 'Ґ': "g",
}

// transliterateToLatin replaces every rune found in cyrillicToLatin with its
// Latin equivalent and copies all other runes through unchanged.
func transliterateToLatin(s string) string {
	if len(s) == 0 {
		return s
	}

	var out strings.Builder
	// A little headroom for letters that expand to several Latin characters.
	out.Grow(len(s) + 8)
	for _, ch := range s {
		repl, mapped := cyrillicToLatin[ch]
		if !mapped {
			out.WriteRune(ch)
			continue
		}
		out.WriteString(repl)
	}
	return out.String()
}
// cleanupRe matches edition/version decorations in a title, case-insensitively:
// a parenthesised group containing one of the listed keywords, or a
// dash-introduced suffix starting with one of the listed keywords (matched
// through to the end of the string).
var cleanupRe = regexp.MustCompile(`(?i)\s*\(([^)]*(remaster|remastered|live|mono|stereo|version|deluxe|explicit|clean|bonus)[^)]*)\)|\s*-\s*(remaster(ed)?|live|version|edit|radio edit).*`)

// cleanTitle strips everything cleanupRe matches from s and trims the result.
// If that would leave nothing, the original s is returned untouched.
func cleanTitle(s string) string {
	if stripped := strings.TrimSpace(cleanupRe.ReplaceAllString(s, "")); stripped != "" {
		return stripped
	}
	return s
}
// similarity scores how alike two strings are, in the range [0, 1].
//
// Empty input scores 0 and identical strings score 1. Otherwise the result is
// 0 when the strings share no whitespace-separated token, and else a blend of
// the token-set Jaccard index (weight 0.6) and the Levenshtein ratio of the
// whole strings (weight 0.4).
func similarity(a, b string) float64 {
	switch {
	case a == "", b == "":
		return 0
	case a == b:
		return 1
	}

	left, right := tokenSet(a), tokenSet(b)
	if len(left) == 0 || len(right) == 0 {
		return 0
	}

	shared := 0
	for tok := range left {
		if _, ok := right[tok]; ok {
			shared++
		}
	}
	if shared == 0 {
		return 0
	}

	union := len(left) + len(right) - shared
	jaccard := float64(shared) / float64(union)
	lev := levenshteinRatio(a, b)
	return (jaccard * 0.6) + (lev * 0.4)
}
// tokenSet splits s on whitespace and returns the distinct tokens as a set.
// The returned map is never nil.
func tokenSet(s string) map[string]struct{} {
	words := strings.Fields(s)
	uniq := make(map[string]struct{}, len(words))
	for i := range words {
		uniq[words[i]] = struct{}{}
	}
	return uniq
}
// levenshteinRatio converts the rune-wise edit distance between a and b into
// a similarity: 1 minus the distance divided by the longer rune length.
// It returns 0 when either string is empty.
func levenshteinRatio(a, b string) float64 {
	ra, rb := []rune(a), []rune(b)
	if len(ra) == 0 || len(rb) == 0 {
		return 0
	}

	longest := len(rb)
	if len(ra) > longest {
		longest = len(ra)
	}
	dist := levenshtein(ra, rb)
	return 1 - float64(dist)/float64(longest)
}
// levenshtein returns the edit distance between a and b, counting each
// insertion, deletion and substitution as 1. It keeps a single row of the
// dynamic-programming table, len(b)+1 cells wide.
func levenshtein(a, b []rune) int {
	row := make([]int, len(b)+1)
	for j := range row {
		row[j] = j
	}

	for i, ra := range a {
		// diag holds the value of the cell up-and-left of the one being
		// computed, i.e. the previous row's entry before it was overwritten.
		diag := row[0]
		row[0] = i + 1
		for j, rb := range b {
			above := row[j+1]
			subst := diag
			if ra != rb {
				subst++
			}
			row[j+1] = min3(
				above+1,  // deletion
				row[j]+1, // insertion
				subst,    // substitution or match
			)
			diag = above
		}
	}
	return row[len(b)]
}
// min3 returns the smallest of a, b and c.
//
// NOTE(review): levenshtein calls this once per table cell, and each call
// sorts a three-element array; plain comparisons would avoid that work.
func min3(a, b, c int) int {
	vals := [...]int{a, b, c}
	sort.Ints(vals[:])
	return vals[0]
}