build spotify-to-navidrome migrator with recovery flow
This commit is contained in:
358
internal/match/matcher.go
Normal file
358
internal/match/matcher.go
Normal file
@@ -0,0 +1,358 @@
|
||||
// Package match pairs source tracks with tracks returned by a Navidrome
// search, scoring candidates on ISRC, title and artist similarity, and
// duration.
package match
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"navimigrate/internal/model"
|
||||
"navimigrate/internal/navidrome"
|
||||
)
|
||||
|
||||
type Searcher interface {
|
||||
SearchTracks(ctx context.Context, query string, limit int) ([]navidrome.Track, error)
|
||||
}
|
||||
|
||||
type Matcher struct {
|
||||
searcher Searcher
|
||||
threshold float64
|
||||
cacheMu sync.RWMutex
|
||||
cache map[string][]navidrome.Track
|
||||
}
|
||||
|
||||
func NewMatcher(searcher Searcher, threshold float64) *Matcher {
|
||||
if threshold < 0 {
|
||||
threshold = 45
|
||||
}
|
||||
return &Matcher{
|
||||
searcher: searcher,
|
||||
threshold: threshold,
|
||||
cache: map[string][]navidrome.Track{},
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Matcher) MatchTrack(ctx context.Context, src model.Track) model.MatchedTrack {
|
||||
queries := m.buildQueries(src)
|
||||
if len(queries) == 0 {
|
||||
return model.MatchedTrack{Source: src, Matched: false, Reason: "no usable metadata"}
|
||||
}
|
||||
|
||||
type scored struct {
|
||||
track navidrome.Track
|
||||
score float64
|
||||
query string
|
||||
}
|
||||
|
||||
best := scored{score: -999}
|
||||
seen := map[string]struct{}{}
|
||||
for _, q := range queries {
|
||||
candidates, err := m.searchCached(ctx, q)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, c := range candidates {
|
||||
if _, ok := seen[c.ID]; ok {
|
||||
continue
|
||||
}
|
||||
seen[c.ID] = struct{}{}
|
||||
score := scoreCandidate(src, c)
|
||||
if score > best.score {
|
||||
best = scored{track: c, score: score, query: q}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if best.track.ID == "" {
|
||||
return model.MatchedTrack{Source: src, Matched: false, Reason: "no candidates"}
|
||||
}
|
||||
|
||||
if best.score >= m.threshold {
|
||||
return model.MatchedTrack{
|
||||
Source: src,
|
||||
TargetID: best.track.ID,
|
||||
Score: best.score,
|
||||
Query: best.query,
|
||||
Matched: true,
|
||||
}
|
||||
}
|
||||
|
||||
reason := fmt.Sprintf("best score %.1f below threshold %.1f", best.score, m.threshold)
|
||||
return model.MatchedTrack{
|
||||
Source: src,
|
||||
TargetID: best.track.ID,
|
||||
Score: best.score,
|
||||
Query: best.query,
|
||||
Matched: false,
|
||||
Reason: reason,
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Matcher) searchCached(ctx context.Context, q string) ([]navidrome.Track, error) {
|
||||
q = strings.TrimSpace(q)
|
||||
if q == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
m.cacheMu.RLock()
|
||||
if v, ok := m.cache[q]; ok {
|
||||
m.cacheMu.RUnlock()
|
||||
return v, nil
|
||||
}
|
||||
m.cacheMu.RUnlock()
|
||||
|
||||
res, err := m.searcher.SearchTracks(ctx, q, 20)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
m.cacheMu.Lock()
|
||||
m.cache[q] = res
|
||||
m.cacheMu.Unlock()
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func (m *Matcher) buildQueries(src model.Track) []string {
|
||||
title := strings.TrimSpace(src.Title)
|
||||
if title == "" {
|
||||
return nil
|
||||
}
|
||||
artist := ""
|
||||
if len(src.Artists) > 0 {
|
||||
artist = src.Artists[0]
|
||||
}
|
||||
latinTitle := strings.TrimSpace(transliterateToLatin(title))
|
||||
latinArtist := strings.TrimSpace(transliterateToLatin(artist))
|
||||
|
||||
queries := []string{}
|
||||
if src.ISRC != "" {
|
||||
queries = append(queries, src.ISRC)
|
||||
}
|
||||
queries = append(queries, strings.TrimSpace(title+" "+artist))
|
||||
if latinTitle != "" {
|
||||
queries = append(queries, strings.TrimSpace(latinTitle+" "+latinArtist))
|
||||
}
|
||||
|
||||
cleanTitle := cleanTitle(title)
|
||||
if cleanTitle != title {
|
||||
queries = append(queries, strings.TrimSpace(cleanTitle+" "+artist))
|
||||
latinClean := strings.TrimSpace(transliterateToLatin(cleanTitle))
|
||||
if latinClean != "" {
|
||||
queries = append(queries, strings.TrimSpace(latinClean+" "+latinArtist))
|
||||
}
|
||||
}
|
||||
|
||||
queries = append(queries, title)
|
||||
if latinTitle != "" {
|
||||
queries = append(queries, latinTitle)
|
||||
}
|
||||
|
||||
uniq := map[string]struct{}{}
|
||||
out := make([]string, 0, len(queries))
|
||||
for _, q := range queries {
|
||||
q = strings.TrimSpace(q)
|
||||
if q == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := uniq[q]; ok {
|
||||
continue
|
||||
}
|
||||
uniq[q] = struct{}{}
|
||||
out = append(out, q)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func scoreCandidate(src model.Track, dst navidrome.Track) float64 {
|
||||
score := 0.0
|
||||
|
||||
if src.ISRC != "" && hasISRC(dst.ISRCs, src.ISRC) {
|
||||
score += 60
|
||||
}
|
||||
|
||||
score += 25 * similarity(normalize(src.Title), normalize(dst.Title))
|
||||
|
||||
primaryArtist := ""
|
||||
if len(src.Artists) > 0 {
|
||||
primaryArtist = src.Artists[0]
|
||||
}
|
||||
if primaryArtist != "" {
|
||||
score += 20 * similarity(normalize(primaryArtist), normalize(dst.Artist))
|
||||
}
|
||||
|
||||
if src.DurationMS > 0 && dst.Duration > 0 {
|
||||
delta := math.Abs(float64(src.DurationMS/1000 - dst.Duration))
|
||||
switch {
|
||||
case delta <= 2:
|
||||
score += 10
|
||||
case delta <= 5:
|
||||
score += 7
|
||||
case delta <= 10:
|
||||
score += 4
|
||||
case delta > 25:
|
||||
score -= 6
|
||||
}
|
||||
}
|
||||
|
||||
nt := normalize(src.Title)
|
||||
dt := normalize(dst.Title)
|
||||
if !strings.Contains(nt, "live") && strings.Contains(dt, "live") {
|
||||
score -= 8
|
||||
}
|
||||
if !strings.Contains(nt, "remix") && strings.Contains(dt, "remix") {
|
||||
score -= 6
|
||||
}
|
||||
if strings.Contains(dt, "karaoke") {
|
||||
score -= 12
|
||||
}
|
||||
|
||||
return score
|
||||
}
|
||||
|
||||
// hasISRC reports whether wanted appears among candidates.
//
// Codes are compared case-insensitively after removing surrounding
// whitespace, hyphens and spaces, so the display form "US-RC1-76-07839" and
// the compact form "USRC17607839" are treated as the same code. A wanted
// value that is empty after this clean-up never matches.
func hasISRC(candidates []string, wanted string) bool {
	// compact strips the separators that are only presentational in an ISRC.
	compact := func(code string) string {
		code = strings.TrimSpace(code)
		code = strings.ReplaceAll(code, "-", "")
		return strings.ReplaceAll(code, " ", "")
	}

	wanted = compact(wanted)
	if wanted == "" {
		return false
	}
	for _, c := range candidates {
		if strings.EqualFold(compact(c), wanted) {
			return true
		}
	}
	return false
}
|
||||
|
||||
var nonAlphaNum = regexp.MustCompile(`[^a-z0-9]+`)
|
||||
|
||||
func normalize(s string) string {
|
||||
s = transliterateToLatin(s)
|
||||
s = strings.ToLower(strings.TrimSpace(s))
|
||||
s = strings.ReplaceAll(s, "&", " and ")
|
||||
s = nonAlphaNum.ReplaceAllString(s, " ")
|
||||
tokens := strings.Fields(s)
|
||||
return strings.Join(tokens, " ")
|
||||
}
|
||||
|
||||
// cyrillicToLatin maps Russian and Ukrainian Cyrillic letters to a lower-case
// Latin spelling. Upper- and lower-case letters map to the same output, and
// the hard and soft signs map to the empty string.
var cyrillicToLatin = map[rune]string{
	// Russian alphabet, lower/upper pairs.
	'а': "a", 'А': "a",
	'б': "b", 'Б': "b",
	'в': "v", 'В': "v",
	'г': "g", 'Г': "g",
	'д': "d", 'Д': "d",
	'е': "e", 'Е': "e",
	'ё': "e", 'Ё': "e",
	'ж': "zh", 'Ж': "zh",
	'з': "z", 'З': "z",
	'и': "i", 'И': "i",
	'й': "i", 'Й': "i",
	'к': "k", 'К': "k",
	'л': "l", 'Л': "l",
	'м': "m", 'М': "m",
	'н': "n", 'Н': "n",
	'о': "o", 'О': "o",
	'п': "p", 'П': "p",
	'р': "r", 'Р': "r",
	'с': "s", 'С': "s",
	'т': "t", 'Т': "t",
	'у': "u", 'У': "u",
	'ф': "f", 'Ф': "f",
	'х': "h", 'Х': "h",
	'ц': "ts", 'Ц': "ts",
	'ч': "ch", 'Ч': "ch",
	'ш': "sh", 'Ш': "sh",
	'щ': "shch", 'Щ': "shch",
	'ъ': "", 'Ъ': "",
	'ы': "y", 'Ы': "y",
	'ь': "", 'Ь': "",
	'э': "e", 'Э': "e",
	'ю': "yu", 'Ю': "yu",
	'я': "ya", 'Я': "ya",

	// Ukrainian additions.
	'і': "i", 'І': "i",
	'ї': "yi", 'Ї': "yi",
	'є': "ye", 'Є': "ye",
	'ґ': "g", 'Ґ': "g",
}
|
||||
|
||||
func transliterateToLatin(s string) string {
|
||||
if s == "" {
|
||||
return s
|
||||
}
|
||||
b := strings.Builder{}
|
||||
b.Grow(len(s) + 8)
|
||||
for _, r := range s {
|
||||
if v, ok := cyrillicToLatin[r]; ok {
|
||||
b.WriteString(v)
|
||||
continue
|
||||
}
|
||||
b.WriteRune(r)
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// cleanupRe matches edition/version decorations in a track title. It has two
// alternatives: a parenthesised group that contains one of the listed keywords
// (remaster, live, mono, stereo, version, deluxe, explicit, clean, bonus), or
// a dash followed directly by remaster(ed), live, version, edit or radio edit
// together with everything after it. Matching is case-insensitive, and the
// keywords are matched as substrings rather than whole words.
var cleanupRe = regexp.MustCompile(`(?i)\s*\(([^)]*(remaster|remastered|live|mono|stereo|version|deluxe|explicit|clean|bonus)[^)]*)\)|\s*-\s*(remaster(ed)?|live|version|edit|radio edit).*`)
|
||||
|
||||
func cleanTitle(s string) string {
|
||||
clean := cleanupRe.ReplaceAllString(s, "")
|
||||
clean = strings.TrimSpace(clean)
|
||||
if clean == "" {
|
||||
return s
|
||||
}
|
||||
return clean
|
||||
}
|
||||
|
||||
func similarity(a, b string) float64 {
|
||||
if a == "" || b == "" {
|
||||
return 0
|
||||
}
|
||||
if a == b {
|
||||
return 1
|
||||
}
|
||||
ta := tokenSet(a)
|
||||
tb := tokenSet(b)
|
||||
if len(ta) == 0 || len(tb) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
inter := 0
|
||||
for t := range ta {
|
||||
if _, ok := tb[t]; ok {
|
||||
inter++
|
||||
}
|
||||
}
|
||||
if inter == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
jaccard := float64(inter) / float64(len(ta)+len(tb)-inter)
|
||||
lev := levenshteinRatio(a, b)
|
||||
return (jaccard * 0.6) + (lev * 0.4)
|
||||
}
|
||||
|
||||
// tokenSet splits s on whitespace and returns the distinct tokens as a set.
// The returned map is never nil, even when s contains no tokens.
func tokenSet(s string) map[string]struct{} {
	words := strings.Fields(s)
	tokens := make(map[string]struct{}, len(words))
	for i := range words {
		tokens[words[i]] = struct{}{}
	}
	return tokens
}
|
||||
|
||||
func levenshteinRatio(a, b string) float64 {
|
||||
ar := []rune(a)
|
||||
br := []rune(b)
|
||||
if len(ar) == 0 || len(br) == 0 {
|
||||
return 0
|
||||
}
|
||||
d := levenshtein(ar, br)
|
||||
maxLen := len(ar)
|
||||
if len(br) > maxLen {
|
||||
maxLen = len(br)
|
||||
}
|
||||
return 1 - float64(d)/float64(maxLen)
|
||||
}
|
||||
|
||||
func levenshtein(a, b []rune) int {
|
||||
dp := make([]int, len(b)+1)
|
||||
for j := 0; j <= len(b); j++ {
|
||||
dp[j] = j
|
||||
}
|
||||
for i := 1; i <= len(a); i++ {
|
||||
prev := dp[0]
|
||||
dp[0] = i
|
||||
for j := 1; j <= len(b); j++ {
|
||||
tmp := dp[j]
|
||||
cost := 0
|
||||
if a[i-1] != b[j-1] {
|
||||
cost = 1
|
||||
}
|
||||
dp[j] = min3(
|
||||
dp[j]+1,
|
||||
dp[j-1]+1,
|
||||
prev+cost,
|
||||
)
|
||||
prev = tmp
|
||||
}
|
||||
}
|
||||
return dp[len(b)]
|
||||
}
|
||||
|
||||
// min3 returns the smallest of a, b and c.
//
// NOTE(review): a plain comparison chain would avoid the slice allocation and
// sort, but this is the only use of the sort package visible in this file, so
// dropping it would also require an import change.
func min3(a, b, c int) int {
	ordered := make([]int, 0, 3)
	ordered = append(ordered, a, b, c)
	sort.Ints(ordered)
	return ordered[0]
}
|
||||
Reference in New Issue
Block a user