build spotify-to-navidrome migrator with recovery flow

2026-04-09 03:10:58 +02:00
parent 650a0c6a87
commit c1360a6423
23 changed files with 3383 additions and 0 deletions
--- a/internal/match/matcher.go
+++ b/internal/match/matcher.go
@@ -0,0 +1,358 @@
+package match
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"regexp"
+	"sort"
+	"strings"
+	"sync"
+
+	"navimigrate/internal/model"
+	"navimigrate/internal/navidrome"
+)
+
+// Searcher is the lookup dependency a Matcher uses to fetch candidate tracks
+// for a query string.
+type Searcher interface {
+	// SearchTracks returns the tracks found for query, or an error.
+	// NOTE(review): limit is presumably the maximum number of results to
+	// return; confirm against the concrete implementation.
+	SearchTracks(ctx context.Context, query string, limit int) ([]navidrome.Track, error)
+}
+
+// Matcher resolves source tracks to target-library tracks by running search
+// queries through a Searcher and scoring the returned candidates.
+type Matcher struct {
+	// searcher performs the actual candidate lookups.
+	searcher  Searcher
+	// threshold is the minimum score the best candidate must reach to be
+	// reported as matched.
+	threshold float64
+	// cacheMu guards cache.
+	cacheMu   sync.RWMutex
+	// cache holds search results keyed by the trimmed query string.
+	cache     map[string][]navidrome.Track
+}
+
+// NewMatcher returns a Matcher that looks candidates up through searcher and
+// treats a candidate as a match once its score reaches threshold. A negative
+// threshold is replaced by the default of 45; zero is kept as given. The
+// result cache starts out empty.
+func NewMatcher(searcher Searcher, threshold float64) *Matcher {
+	m := new(Matcher)
+	m.searcher = searcher
+	m.cache = make(map[string][]navidrome.Track)
+	m.threshold = threshold
+	if m.threshold < 0 {
+		m.threshold = 45
+	}
+	return m
+}
+
+// MatchTrack looks src up in the target library and reports the best-scoring
+// candidate.
+//
+// Every query produced by buildQueries is run through the cached searcher.
+// Candidates are de-duplicated by ID and scored with scoreCandidate. Matched
+// is set only when the best score reaches the matcher's threshold; a
+// below-threshold best candidate is still reported (TargetID, Score, Query)
+// together with a Reason.
+//
+// A failing query does not abort the match, because later queries may still
+// succeed. The first search error is remembered, though, and when no
+// candidate was found at all it is surfaced in Reason instead of the
+// misleading "no candidates". Once ctx is done the remaining queries are
+// skipped.
+func (m *Matcher) MatchTrack(ctx context.Context, src model.Track) model.MatchedTrack {
+	queries := m.buildQueries(src)
+	if len(queries) == 0 {
+		return model.MatchedTrack{Source: src, Matched: false, Reason: "no usable metadata"}
+	}
+
+	type scored struct {
+		track navidrome.Track
+		score float64
+		query string
+	}
+
+	// -999 is far below anything scoreCandidate can produce, so the first
+	// candidate seen always replaces it.
+	best := scored{score: -999}
+	seen := map[string]struct{}{}
+	var searchErr error
+	for _, q := range queries {
+		candidates, err := m.searchCached(ctx, q)
+		if err != nil {
+			if searchErr == nil {
+				searchErr = err
+			}
+			// A cancelled or expired context fails every further search too.
+			if ctx != nil && ctx.Err() != nil {
+				break
+			}
+			continue
+		}
+		for _, c := range candidates {
+			// The same track can come back for several queries; score it once.
+			if _, ok := seen[c.ID]; ok {
+				continue
+			}
+			seen[c.ID] = struct{}{}
+			score := scoreCandidate(src, c)
+			if score > best.score {
+				best = scored{track: c, score: score, query: q}
+			}
+		}
+	}
+
+	if best.track.ID == "" {
+		reason := "no candidates"
+		if searchErr != nil {
+			reason = fmt.Sprintf("search failed: %v", searchErr)
+		}
+		return model.MatchedTrack{Source: src, Matched: false, Reason: reason}
+	}
+
+	if best.score >= m.threshold {
+		return model.MatchedTrack{
+			Source:   src,
+			TargetID: best.track.ID,
+			Score:    best.score,
+			Query:    best.query,
+			Matched:  true,
+		}
+	}
+
+	reason := fmt.Sprintf("best score %.1f below threshold %.1f", best.score, m.threshold)
+	return model.MatchedTrack{
+		Source:   src,
+		TargetID: best.track.ID,
+		Score:    best.score,
+		Query:    best.query,
+		Matched:  false,
+		Reason:   reason,
+	}
+}
+
+// searchCached returns the search results for q, consulting the in-memory
+// cache before calling the searcher (with a limit of 20). The query is
+// trimmed first; an empty query yields (nil, nil) without touching the cache
+// or the searcher. Successful results are stored under the trimmed query;
+// errors are returned as-is and never cached.
+func (m *Matcher) searchCached(ctx context.Context, q string) ([]navidrome.Track, error) {
+	key := strings.TrimSpace(q)
+	if key == "" {
+		return nil, nil
+	}
+
+	m.cacheMu.RLock()
+	hit, found := m.cache[key]
+	m.cacheMu.RUnlock()
+	if found {
+		return hit, nil
+	}
+
+	// The lock is not held during the search, so concurrent callers asking
+	// for the same uncached query may each hit the searcher.
+	tracks, err := m.searcher.SearchTracks(ctx, key, 20)
+	if err != nil {
+		return nil, err
+	}
+
+	m.cacheMu.Lock()
+	m.cache[key] = tracks
+	m.cacheMu.Unlock()
+	return tracks, nil
+}
+
+// buildQueries derives the ordered, de-duplicated list of search strings to
+// try for src. It returns nil when the trimmed title is empty. Order:
+//
+//  1. the ISRC, if present
+//  2. "title artist" (first artist only)
+//  3. the transliterated form of 2
+//  4. "cleaned-title artist" and its transliterated form, when cleanTitle
+//     changes the title
+//  5. the bare title
+//  6. the bare transliterated title
+//
+// Each query is trimmed; empty strings and repeats are dropped, keeping the
+// first occurrence.
+func (m *Matcher) buildQueries(src model.Track) []string {
+	title := strings.TrimSpace(src.Title)
+	if title == "" {
+		return nil
+	}
+	var artist string
+	if len(src.Artists) > 0 {
+		artist = src.Artists[0]
+	}
+	latinTitle := strings.TrimSpace(transliterateToLatin(title))
+	latinArtist := strings.TrimSpace(transliterateToLatin(artist))
+
+	seen := make(map[string]struct{})
+	out := make([]string, 0, 7)
+	// add appends q unless it is blank or already present.
+	add := func(q string) {
+		q = strings.TrimSpace(q)
+		if q == "" {
+			return
+		}
+		if _, dup := seen[q]; dup {
+			return
+		}
+		seen[q] = struct{}{}
+		out = append(out, q)
+	}
+
+	add(src.ISRC)
+	add(title + " " + artist)
+	if latinTitle != "" {
+		add(latinTitle + " " + latinArtist)
+	}
+
+	if stripped := cleanTitle(title); stripped != title {
+		add(stripped + " " + artist)
+		if latinStripped := strings.TrimSpace(transliterateToLatin(stripped)); latinStripped != "" {
+			add(latinStripped + " " + latinArtist)
+		}
+	}
+
+	add(title)
+	add(latinTitle)
+	return out
+}
+
+// scoreCandidate rates how well dst matches src; higher is better.
+//
+// Contributions:
+//   - +60 when src has an ISRC that appears among dst.ISRCs
+//   - up to +25 for similarity of the normalized titles
+//   - up to +20 for similarity of src's first artist and dst.Artist
+//   - +10 / +7 / +4 for a duration gap of at most 2 / 5 / 10 seconds, and -6
+//     for a gap above 25 seconds (only when both durations are positive)
+//   - -8 / -6 when dst's title contains "live" / "remix" but src's does not
+//   - -12 when dst's title contains "karaoke"
+func scoreCandidate(src model.Track, dst navidrome.Track) float64 {
+	srcTitle, dstTitle := normalize(src.Title), normalize(dst.Title)
+
+	var total float64
+	if src.ISRC != "" && hasISRC(dst.ISRCs, src.ISRC) {
+		total += 60
+	}
+
+	total += 25 * similarity(srcTitle, dstTitle)
+
+	if len(src.Artists) > 0 && src.Artists[0] != "" {
+		total += 20 * similarity(normalize(src.Artists[0]), normalize(dst.Artist))
+	}
+
+	if src.DurationMS > 0 && dst.Duration > 0 {
+		// The source duration is reduced to whole seconds before comparing.
+		gap := math.Abs(float64(src.DurationMS/1000 - dst.Duration))
+		switch {
+		case gap <= 2:
+			total += 10
+		case gap <= 5:
+			total += 7
+		case gap <= 10:
+			total += 4
+		case gap > 25:
+			total -= 6
+		}
+	}
+
+	// Variant markers in the candidate title. The first two only count
+	// against the candidate when the source title does not carry the same
+	// marker.
+	penalties := []struct {
+		marker      string
+		cost        float64
+		unlessInSrc bool
+	}{
+		{"live", 8, true},
+		{"remix", 6, true},
+		{"karaoke", 12, false},
+	}
+	for _, p := range penalties {
+		if !strings.Contains(dstTitle, p.marker) {
+			continue
+		}
+		if p.unlessInSrc && strings.Contains(srcTitle, p.marker) {
+			continue
+		}
+		total -= p.cost
+	}
+
+	return total
+}
+
+// hasISRC reports whether wanted occurs in candidates. Surrounding
+// whitespace and letter case are ignored on both sides; a blank wanted never
+// matches.
+func hasISRC(candidates []string, wanted string) bool {
+	target := strings.ToUpper(strings.TrimSpace(wanted))
+	if len(target) == 0 {
+		return false
+	}
+	for i := range candidates {
+		if strings.EqualFold(strings.TrimSpace(candidates[i]), target) {
+			return true
+		}
+	}
+	return false
+}
+
+// nonAlphaNum matches every run of characters that are neither Unicode
+// letters nor Unicode digits. An ASCII-only class would erase titles written
+// in scripts that transliterateToLatin does not cover (CJK, Greek, ...) and
+// would split accented words such as "rós" into fragments.
+var nonAlphaNum = regexp.MustCompile(`[^\p{L}\p{N}]+`)
+
+// normalize reduces s to a canonical form for comparison: Cyrillic is
+// transliterated, the text is lower-cased, "&" becomes the word "and", every
+// run of non-letter/non-digit characters becomes a single space, and leading
+// and trailing whitespace is removed. Input without any letters or digits
+// normalizes to "".
+func normalize(s string) string {
+	s = transliterateToLatin(s)
+	s = strings.ToLower(strings.TrimSpace(s))
+	s = strings.ReplaceAll(s, "&", " and ")
+	s = nonAlphaNum.ReplaceAllString(s, " ")
+	// Fields/Join collapses the spaces introduced above and trims the ends.
+	return strings.Join(strings.Fields(s), " ")
+}
+
+// cyrillicToLatin maps single Cyrillic letters to a lowercase Latin
+// spelling. Lowercase and uppercase letters are listed separately and both
+// map to lowercase output. The hard and soft signs map to the empty string,
+// so they are dropped. The last line of each half holds letters і, ї, є, ґ.
+var cyrillicToLatin = map[rune]string{
+	'а': "a", 'б': "b", 'в': "v", 'г': "g", 'д': "d", 'е': "e", 'ё': "e", 'ж': "zh", 'з': "z", 'и': "i", 'й': "i",
+	'к': "k", 'л': "l", 'м': "m", 'н': "n", 'о': "o", 'п': "p", 'р': "r", 'с': "s", 'т': "t", 'у': "u", 'ф': "f",
+	'х': "h", 'ц': "ts", 'ч': "ch", 'ш': "sh", 'щ': "shch", 'ъ': "", 'ы': "y", 'ь': "", 'э': "e", 'ю': "yu", 'я': "ya",
+	'і': "i", 'ї': "yi", 'є': "ye", 'ґ': "g",
+	'А': "a", 'Б': "b", 'В': "v", 'Г': "g", 'Д': "d", 'Е': "e", 'Ё': "e", 'Ж': "zh", 'З': "z", 'И': "i", 'Й': "i",
+	'К': "k", 'Л': "l", 'М': "m", 'Н': "n", 'О': "o", 'П': "p", 'Р': "r", 'С': "s", 'Т': "t", 'У': "u", 'Ф': "f",
+	'Х': "h", 'Ц': "ts", 'Ч': "ch", 'Ш': "sh", 'Щ': "shch", 'Ъ': "", 'Ы': "y", 'Ь': "", 'Э': "e", 'Ю': "yu", 'Я': "ya",
+	'І': "i", 'Ї': "yi", 'Є': "ye", 'Ґ': "g",
+}
+
+// transliterateToLatin returns s with every rune that has an entry in
+// cyrillicToLatin replaced by its Latin spelling. All other runes are copied
+// through unchanged, so text without mapped letters comes back as-is.
+func transliterateToLatin(s string) string {
+	if len(s) == 0 {
+		return s
+	}
+	var out strings.Builder
+	// Some letters expand to several Latin characters; leave a little slack.
+	out.Grow(len(s) + 8)
+	for _, r := range s {
+		latin, mapped := cyrillicToLatin[r]
+		if !mapped {
+			out.WriteRune(r)
+			continue
+		}
+		out.WriteString(latin)
+	}
+	return out.String()
+}
+
+// cleanupRe matches edition/variant decorations in a track title:
+//
+//   - a parenthesised group containing one of the variant keywords as a whole
+//     word, e.g. "(2011 Remaster)" or "(Live at Wembley)"
+//   - a " - " suffix that starts with a variant keyword, optionally preceded
+//     by a four-digit year, e.g. " - Remastered 2011" or " - 2017 Remaster"
+//
+// Keywords are anchored with word boundaries and the dash must be surrounded
+// by whitespace, so ordinary words and hyphenated words ("(Alive)",
+// "(Re-Edit)", "- Liverpool Mix") are left alone.
+var cleanupRe = regexp.MustCompile(`(?i)\s*\([^)]*\b(?:remaster(?:ed)?|live|mono|stereo|version|deluxe|explicit|clean|bonus)\b[^)]*\)|\s+-\s+(?:\d{4}\s+)?(?:remaster(?:ed)?|live|version|edit|radio edit)\b.*`)
+
+// cleanTitle returns s with the decorations matched by cleanupRe removed and
+// the result trimmed. When nothing would remain, s is returned unchanged so
+// the caller never gets an empty title out of a non-empty one.
+func cleanTitle(s string) string {
+	clean := strings.TrimSpace(cleanupRe.ReplaceAllString(s, ""))
+	if clean == "" {
+		return s
+	}
+	return clean
+}
+
+// similarity scores how alike a and b are, from 0 to 1. Either string being
+// empty gives 0 and identical strings give 1. Otherwise the result is 60%
+// Jaccard overlap of the whitespace-separated token sets plus 40%
+// Levenshtein ratio of the full strings. Strings that share no token score 0
+// regardless of their edit distance.
+func similarity(a, b string) float64 {
+	switch {
+	case a == "" || b == "":
+		return 0
+	case a == b:
+		return 1
+	}
+
+	left, right := tokenSet(a), tokenSet(b)
+	if len(left) == 0 || len(right) == 0 {
+		return 0
+	}
+
+	shared := 0
+	for tok := range left {
+		if _, hit := right[tok]; hit {
+			shared++
+		}
+	}
+	if shared == 0 {
+		return 0
+	}
+
+	union := len(left) + len(right) - shared
+	jaccard := float64(shared) / float64(union)
+	lev := levenshteinRatio(a, b)
+	return (jaccard * 0.6) + (lev * 0.4)
+}
+
+// tokenSet splits s on whitespace and returns the distinct tokens as a set.
+// The result is never nil; input without tokens yields an empty set.
+func tokenSet(s string) map[string]struct{} {
+	words := strings.Fields(s)
+	out := make(map[string]struct{}, len(words))
+	for i := range words {
+		out[words[i]] = struct{}{}
+	}
+	return out
+}
+
+// levenshteinRatio converts the rune-wise edit distance between a and b into
+// a ratio: 1 minus the distance divided by the longer length. It returns 0
+// when either string is empty.
+func levenshteinRatio(a, b string) float64 {
+	left, right := []rune(a), []rune(b)
+	if len(left) == 0 || len(right) == 0 {
+		return 0
+	}
+	longest := len(right)
+	if len(left) > longest {
+		longest = len(left)
+	}
+	return 1 - float64(levenshtein(left, right))/float64(longest)
+}
+
+// levenshtein returns the edit distance between a and b: the minimum number
+// of single-rune insertions, deletions and substitutions that turn one into
+// the other. It keeps two rows of the distance table in memory.
+func levenshtein(a, b []rune) int {
+	// prev is the row for the already-processed prefix of a; it starts as the
+	// row for the empty prefix (distance j to the first j runes of b).
+	prev := make([]int, len(b)+1)
+	for j := range prev {
+		prev[j] = j
+	}
+	cur := make([]int, len(b)+1)
+
+	for i, ra := range a {
+		cur[0] = i + 1
+		for j, rb := range b {
+			substitute := prev[j]
+			if ra != rb {
+				substitute++
+			}
+			cur[j+1] = min3(
+				prev[j+1]+1,
+				cur[j]+1,
+				substitute,
+			)
+		}
+		prev, cur = cur, prev
+	}
+	return prev[len(b)]
+}
+
+// min3 returns the smallest of a, b and c.
+//
+// NOTE(review): this is the file's only use of package sort. Replacing the
+// sort with plain comparisons would also require dropping that import.
+func min3(a, b, c int) int {
+	vals := [3]int{a, b, c}
+	sort.Ints(vals[:])
+	return vals[0]
+}
--- a/internal/match/matcher_test.go
+++ b/internal/match/matcher_test.go
@@ -0,0 +1,64 @@
+package match
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"navimigrate/internal/model"
+	"navimigrate/internal/navidrome"
+)
+
+// fakeSearcher is a canned Searcher for tests.
+type fakeSearcher struct {
+	// tracks is returned by every SearchTracks call.
+	tracks []navidrome.Track
+}
+
+// SearchTracks ignores its arguments and returns the configured tracks with
+// a nil error.
+func (f fakeSearcher) SearchTracks(context.Context, string, int) ([]navidrome.Track, error) {
+	return f.tracks, nil
+}
+
+// TestNormalizeTransliteratesCyrillic checks that normalize turns a Cyrillic
+// word into its Latin transliteration.
+func TestNormalizeTransliteratesCyrillic(t *testing.T) {
+	const want = "detstvo"
+	if got := normalize("детство"); got != want {
+		t.Fatalf("expected detstvo, got %q", got)
+	}
+}
+
+// TestBuildQueriesIncludesLatinVariant checks that a Cyrillic title yields at
+// least one query containing its Latin transliteration.
+func TestBuildQueriesIncludesLatinVariant(t *testing.T) {
+	src := model.Track{
+		Title:   "детство",
+		Artists: []string{"Rauf & Faik"},
+	}
+	queries := NewMatcher(fakeSearcher{}, 45).buildQueries(src)
+
+	haystack := strings.ToLower(strings.Join(queries, "\n"))
+	if !strings.Contains(haystack, "detstvo") {
+		t.Fatalf("expected transliterated query to include detstvo, got %v", queries)
+	}
+}
+
+// TestMatchThresholdIsConfigurable checks that the same candidate is rejected
+// under a threshold of 100 and accepted under a threshold of 0.
+func TestMatchThresholdIsConfigurable(t *testing.T) {
+	src := model.Track{
+		Title:      "One More Time",
+		Artists:    []string{"Daft Punk"},
+		DurationMS: 317000,
+	}
+	hit := navidrome.Track{
+		ID:       "track-1",
+		Title:    "One More Time",
+		Artist:   "Daft Punk",
+		Duration: 317,
+	}
+	search := fakeSearcher{tracks: []navidrome.Track{hit}}
+	ctx := context.Background()
+
+	strict := NewMatcher(search, 100).MatchTrack(ctx, src)
+	if strict.Matched {
+		t.Fatalf("expected no match with high threshold, score=%.1f", strict.Score)
+	}
+
+	lenient := NewMatcher(search, 0).MatchTrack(ctx, src)
+	if !lenient.Matched {
+		t.Fatalf("expected match with low threshold, score=%.1f", lenient.Score)
+	}
+}