build spotify-to-navidrome migrator with recovery flow

This commit is contained in:
2026-04-09 03:10:58 +02:00
parent 650a0c6a87
commit c1360a6423
23 changed files with 3383 additions and 0 deletions

358
internal/match/matcher.go Normal file
View File

@@ -0,0 +1,358 @@
package match
import (
"context"
"fmt"
"math"
"regexp"
"sort"
"strings"
"sync"
"navimigrate/internal/model"
"navimigrate/internal/navidrome"
)
// Searcher is the track-lookup dependency of Matcher. Declaring it here, at
// the consumer, lets tests substitute a fake for the real client.
type Searcher interface {
// SearchTracks looks up tracks for a free-text query and returns them, or an
// error. limit is presumably the maximum number of results wanted — confirm
// against the implementation.
SearchTracks(ctx context.Context, query string, limit int) ([]navidrome.Track, error)
}
// Matcher finds the best search result for a source track and remembers the
// results of each search query it has issued.
type Matcher struct {
// searcher performs the actual track searches.
searcher Searcher
// threshold is the minimum score at which MatchTrack reports a match.
threshold float64
// cacheMu guards cache.
cacheMu sync.RWMutex
// cache maps a trimmed query string to the results returned for it.
cache map[string][]navidrome.Track
}
// NewMatcher builds a Matcher that searches through searcher and reports a
// match when a candidate scores at least threshold. A negative threshold is
// replaced by the default of 45; zero and positive values are used as given.
func NewMatcher(searcher Searcher, threshold float64) *Matcher {
	const defaultThreshold = 45

	minScore := threshold
	if minScore < 0 {
		minScore = defaultThreshold
	}

	m := new(Matcher)
	m.searcher = searcher
	m.threshold = minScore
	m.cache = make(map[string][]navidrome.Track)
	return m
}
// MatchTrack searches for the candidate that best corresponds to src.
//
// Each query from buildQueries is run through the cached searcher. Every
// distinct candidate (by ID) is scored once with scoreCandidate, and the
// highest score wins; on a tie the candidate seen first is kept.
//
// The result always carries Source. Matched is true only when the best score
// reaches the matcher's threshold. When it does not, TargetID, Score and Query
// still describe the best candidate and Reason explains the rejection. When no
// candidate was found at all, Reason starts with "no candidates" and, if any
// search failed, includes the most recent search error so that a search
// failure is distinguishable from a genuinely empty result.
//
// A failed search is skipped so the remaining queries still get a chance, but
// once ctx is done no further queries are issued.
func (m *Matcher) MatchTrack(ctx context.Context, src model.Track) model.MatchedTrack {
	queries := m.buildQueries(src)
	if len(queries) == 0 {
		return model.MatchedTrack{Source: src, Matched: false, Reason: "no usable metadata"}
	}

	type scored struct {
		track navidrome.Track
		score float64
		query string
	}

	// Start below any real score so the first candidate always wins.
	best := scored{score: math.Inf(-1)}
	seen := make(map[string]struct{})
	var searchErr error

	for _, q := range queries {
		candidates, err := m.searchCached(ctx, q)
		if err != nil {
			searchErr = err
			if ctx.Err() != nil {
				// Cancelled or timed out: further searches cannot succeed.
				break
			}
			continue
		}
		for _, c := range candidates {
			if _, ok := seen[c.ID]; ok {
				continue
			}
			seen[c.ID] = struct{}{}
			if score := scoreCandidate(src, c); score > best.score {
				best = scored{track: c, score: score, query: q}
			}
		}
	}

	if best.track.ID == "" {
		reason := "no candidates"
		if searchErr != nil {
			reason = fmt.Sprintf("no candidates (search failed: %v)", searchErr)
		}
		return model.MatchedTrack{Source: src, Matched: false, Reason: reason}
	}

	if best.score >= m.threshold {
		return model.MatchedTrack{
			Source:   src,
			TargetID: best.track.ID,
			Score:    best.score,
			Query:    best.query,
			Matched:  true,
		}
	}

	reason := fmt.Sprintf("best score %.1f below threshold %.1f", best.score, m.threshold)
	return model.MatchedTrack{
		Source:   src,
		TargetID: best.track.ID,
		Score:    best.score,
		Query:    best.query,
		Matched:  false,
		Reason:   reason,
	}
}
// searchCached returns the search results for q, asking the searcher only when
// the trimmed query has not been answered before. A blank query yields
// (nil, nil) without searching. Errors are returned as-is and never cached.
//
// The lock is not held across the search, so concurrent misses for the same
// query may each reach the searcher; whichever finishes last populates the
// cache entry.
func (m *Matcher) searchCached(ctx context.Context, q string) ([]navidrome.Track, error) {
	key := strings.TrimSpace(q)
	if len(key) == 0 {
		return nil, nil
	}

	m.cacheMu.RLock()
	cached, hit := m.cache[key]
	m.cacheMu.RUnlock()
	if hit {
		return cached, nil
	}

	found, err := m.searcher.SearchTracks(ctx, key, 20)
	if err != nil {
		return nil, err
	}

	m.cacheMu.Lock()
	m.cache[key] = found
	m.cacheMu.Unlock()

	return found, nil
}
// buildQueries produces the ordered, de-duplicated list of search strings to
// try for src, or nil when src has no title.
//
// Order: ISRC (if set), "title artist", its transliterated form, the same two
// forms for the title with edition/version decorations stripped (only when
// stripping changed something), then the bare title and the bare
// transliterated title. Only the first artist is used. Every entry is trimmed;
// blanks and repeats are dropped, keeping the first occurrence.
func (m *Matcher) buildQueries(src model.Track) []string {
	title := strings.TrimSpace(src.Title)
	if title == "" {
		return nil
	}

	var artist string
	if len(src.Artists) > 0 {
		artist = src.Artists[0]
	}
	latinTitle := strings.TrimSpace(transliterateToLatin(title))
	latinArtist := strings.TrimSpace(transliterateToLatin(artist))

	known := make(map[string]struct{})
	out := make([]string, 0, 8)
	add := func(candidate string) {
		candidate = strings.TrimSpace(candidate)
		if candidate == "" {
			return
		}
		if _, dup := known[candidate]; dup {
			return
		}
		known[candidate] = struct{}{}
		out = append(out, candidate)
	}

	if src.ISRC != "" {
		add(src.ISRC)
	}
	add(title + " " + artist)
	if latinTitle != "" {
		add(latinTitle + " " + latinArtist)
	}

	if stripped := cleanTitle(title); stripped != title {
		add(stripped + " " + artist)
		if latinStripped := strings.TrimSpace(transliterateToLatin(stripped)); latinStripped != "" {
			add(latinStripped + " " + latinArtist)
		}
	}

	add(title)
	if latinTitle != "" {
		add(latinTitle)
	}
	return out
}
// scoreCandidate rates how well dst corresponds to src; higher is better.
//
// Contributions:
//   - +60 when src has an ISRC that appears among dst's ISRCs;
//   - up to +25 for title similarity and up to +20 for similarity between
//     src's first artist and dst's artist (both on normalized text);
//   - duration, when both sides have one: +10, +7 or +4 for a difference of at
//     most 2, 5 or 10, and -6 for a difference above 25 (src.DurationMS/1000
//     is compared with dst.Duration, presumably seconds);
//   - -8 when dst's title has the word "live" and src's does not, -6 for an
//     unmatched "remix", and -12 whenever dst's title mentions "karaoke".
//
// "live" is matched as a whole word. A substring test would let a source such
// as "Alive" hide the penalty for "Alive (Live)" and would penalise
// candidates like "Delivered". "remix" and "karaoke" stay substring tests so
// that variants such as "remixed" still count.
func scoreCandidate(src model.Track, dst navidrome.Track) float64 {
	// Normalize each title once; both the similarity and the penalties use them.
	srcTitle := normalize(src.Title)
	dstTitle := normalize(dst.Title)

	score := 0.0
	if src.ISRC != "" && hasISRC(dst.ISRCs, src.ISRC) {
		score += 60
	}
	score += 25 * similarity(srcTitle, dstTitle)

	if len(src.Artists) > 0 && src.Artists[0] != "" {
		score += 20 * similarity(normalize(src.Artists[0]), normalize(dst.Artist))
	}

	if src.DurationMS > 0 && dst.Duration > 0 {
		delta := math.Abs(float64(src.DurationMS/1000 - dst.Duration))
		switch {
		case delta <= 2:
			score += 10
		case delta <= 5:
			score += 7
		case delta <= 10:
			score += 4
		case delta > 25:
			score -= 6
		}
	}

	// Normalized titles are single-space separated, so padding both ends turns
	// a substring search into a whole-word search.
	hasWord := func(title, word string) bool {
		return strings.Contains(" "+title+" ", " "+word+" ")
	}
	if hasWord(dstTitle, "live") && !hasWord(srcTitle, "live") {
		score -= 8
	}
	if strings.Contains(dstTitle, "remix") && !strings.Contains(srcTitle, "remix") {
		score -= 6
	}
	if strings.Contains(dstTitle, "karaoke") {
		score -= 12
	}
	return score
}
// hasISRC reports whether wanted occurs in candidates, ignoring letter case
// and surrounding whitespace on both sides. A blank wanted never matches.
func hasISRC(candidates []string, wanted string) bool {
	target := strings.ToUpper(strings.TrimSpace(wanted))
	if len(target) == 0 {
		return false
	}
	for i := range candidates {
		if code := strings.TrimSpace(candidates[i]); strings.EqualFold(code, target) {
			return true
		}
	}
	return false
}
// nonAlphaNum matches every run of characters that are neither letters nor
// digits in any script. The Unicode classes are used rather than a-z0-9 so
// that text outside Latin and Cyrillic (Greek, CJK, ...) and accented Latin
// letters survive normalization. With an ASCII-only class such text was
// erased or split apart and could never contribute to a match.
var nonAlphaNum = regexp.MustCompile(`[^\p{L}\p{N}]+`)

// normalize reduces s to a canonical form for comparison: Cyrillic is
// transliterated, the text is trimmed and lower-cased, "&" becomes the word
// "and", every run of non-letter/non-digit characters becomes a separator, and
// the remaining words are joined by single spaces. The result has no leading
// or trailing space and is empty when s has no letters or digits.
func normalize(s string) string {
	s = transliterateToLatin(s)
	s = strings.ToLower(strings.TrimSpace(s))
	s = strings.ReplaceAll(s, "&", " and ")
	s = nonAlphaNum.ReplaceAllString(s, " ")
	return strings.Join(strings.Fields(s), " ")
}
// cyrillicToLatin maps Cyrillic letters to their Latin replacements. It covers
// the Russian alphabet plus і, ї, є and ґ, each in lower and upper case. Every
// replacement is lowercase regardless of the input letter's case, and the hard
// and soft signs (ъ, ь) map to the empty string, i.e. they are dropped.
var cyrillicToLatin = map[rune]string{
'а': "a", 'б': "b", 'в': "v", 'г': "g", 'д': "d", 'е': "e", 'ё': "e", 'ж': "zh", 'з': "z", 'и': "i", 'й': "i",
'к': "k", 'л': "l", 'м': "m", 'н': "n", 'о': "o", 'п': "p", 'р': "r", 'с': "s", 'т': "t", 'у': "u", 'ф': "f",
'х': "h", 'ц': "ts", 'ч': "ch", 'ш': "sh", 'щ': "shch", 'ъ': "", 'ы': "y", 'ь': "", 'э': "e", 'ю': "yu", 'я': "ya",
'і': "i", 'ї': "yi", 'є': "ye", 'ґ': "g",
'А': "a", 'Б': "b", 'В': "v", 'Г': "g", 'Д': "d", 'Е': "e", 'Ё': "e", 'Ж': "zh", 'З': "z", 'И': "i", 'Й': "i",
'К': "k", 'Л': "l", 'М': "m", 'Н': "n", 'О': "o", 'П': "p", 'Р': "r", 'С': "s", 'Т': "t", 'У': "u", 'Ф': "f",
'Х': "h", 'Ц': "ts", 'Ч': "ch", 'Ш': "sh", 'Щ': "shch", 'Ъ': "", 'Ы': "y", 'Ь': "", 'Э': "e", 'Ю': "yu", 'Я': "ya",
'І': "i", 'Ї': "yi", 'Є': "ye", 'Ґ': "g",
}
// transliterateToLatin rewrites s rune by rune, replacing every rune that has
// an entry in cyrillicToLatin with that entry and copying all other runes
// through unchanged. An empty string is returned as is.
func transliterateToLatin(s string) string {
	if len(s) == 0 {
		return s
	}

	var out strings.Builder
	// Several replacements are longer than one character; leave some slack.
	out.Grow(len(s) + 8)
	for _, ch := range s {
		latin, mapped := cyrillicToLatin[ch]
		if !mapped {
			out.WriteRune(ch)
			continue
		}
		out.WriteString(latin)
	}
	return out.String()
}
// cleanupRe matches, case-insensitively, either a parenthesised group that
// contains an edition/version keyword (remaster, live, mono, stereo, version,
// deluxe, explicit, clean, bonus) or a " - <keyword>..." suffix that runs to
// the end of the string (remaster(ed), live, version, edit, radio edit).
var cleanupRe = regexp.MustCompile(`(?i)\s*\(([^)]*(remaster|remastered|live|mono|stereo|version|deluxe|explicit|clean|bonus)[^)]*)\)|\s*-\s*(remaster(ed)?|live|version|edit|radio edit).*`)

// cleanTitle removes everything cleanupRe matches from s and trims the result.
// If nothing would be left, the original s is returned untouched.
func cleanTitle(s string) string {
	if stripped := strings.TrimSpace(cleanupRe.ReplaceAllString(s, "")); stripped != "" {
		return stripped
	}
	return s
}
// similarity returns a value in [0, 1] describing how alike a and b are.
// Identical strings score 1. An empty side, or no whitespace-separated word in
// common, scores 0. Otherwise the result blends word overlap (Jaccard index,
// weight 0.6) with character-level closeness (Levenshtein ratio, weight 0.4).
func similarity(a, b string) float64 {
	switch {
	case a == "" || b == "":
		return 0
	case a == b:
		return 1
	}

	left, right := tokenSet(a), tokenSet(b)
	if len(left) == 0 || len(right) == 0 {
		return 0
	}

	shared := 0
	for word := range left {
		if _, both := right[word]; both {
			shared++
		}
	}
	if shared == 0 {
		return 0
	}

	union := len(left) + len(right) - shared
	jaccard := float64(shared) / float64(union)
	lev := levenshteinRatio(a, b)
	return (jaccard * 0.6) + (lev * 0.4)
}
// tokenSet splits s on whitespace and returns the distinct words as a set.
// The result is never nil; it is empty when s has no words.
func tokenSet(s string) map[string]struct{} {
	words := strings.Fields(s)
	unique := make(map[string]struct{}, len(words))
	for i := 0; i < len(words); i++ {
		unique[words[i]] = struct{}{}
	}
	return unique
}
// levenshteinRatio converts the edit distance between a and b into a closeness
// score: 1 minus the distance divided by the longer length, both measured in
// runes. It returns 0 when either string is empty.
func levenshteinRatio(a, b string) float64 {
	left, right := []rune(a), []rune(b)
	if len(left) == 0 || len(right) == 0 {
		return 0
	}

	longest := len(right)
	if len(left) > longest {
		longest = len(left)
	}

	dist := levenshtein(left, right)
	return 1 - float64(dist)/float64(longest)
}
// levenshtein returns the edit distance between a and b, counting each
// insertion, deletion and substitution of a rune as one step. It keeps a
// single row of the distance table and updates it in place.
func levenshtein(a, b []rune) int {
	row := make([]int, len(b)+1)
	for j := range row {
		row[j] = j
	}

	for i, ra := range a {
		// diag holds the previous row's value one column to the left, which the
		// in-place update would otherwise overwrite.
		diag := row[0]
		row[0] = i + 1
		for j, rb := range b {
			above := row[j+1]
			substitute := diag
			if ra != rb {
				substitute++
			}
			row[j+1] = min3(above+1, row[j]+1, substitute)
			diag = above
		}
	}
	return row[len(b)]
}
// min3 returns the smallest of a, b and c.
func min3(a, b, c int) int {
	vals := [3]int{a, b, c}
	sort.Ints(vals[:])
	return vals[0]
}

View File

@@ -0,0 +1,64 @@
package match
import (
"context"
"strings"
"testing"
"navimigrate/internal/model"
"navimigrate/internal/navidrome"
)
// fakeSearcher is a test double for Searcher that serves a fixed track list.
type fakeSearcher struct {
// tracks is returned verbatim by every SearchTracks call.
tracks []navidrome.Track
}
// SearchTracks ignores its arguments and always succeeds with the canned
// track list.
func (f fakeSearcher) SearchTracks(_ context.Context, _ string, _ int) ([]navidrome.Track, error) {
	canned := f.tracks
	return canned, nil
}
// TestNormalizeTransliteratesCyrillic checks that normalize converts a
// Cyrillic word to its Latin transliteration.
func TestNormalizeTransliteratesCyrillic(t *testing.T) {
	const want = "detstvo"
	if got := normalize("детство"); got != want {
		t.Fatalf("expected detstvo, got %q", got)
	}
}
func TestBuildQueriesIncludesLatinVariant(t *testing.T) {
m := NewMatcher(fakeSearcher{}, 45)
q := m.buildQueries(model.Track{
Title: "детство",
Artists: []string{"Rauf & Faik"},
})
joined := strings.Join(q, "\n")
if !strings.Contains(strings.ToLower(joined), "detstvo") {
t.Fatalf("expected transliterated query to include detstvo, got %v", q)
}
}
// TestMatchThresholdIsConfigurable checks that the same perfect candidate is
// rejected under a threshold of 100 and accepted under a threshold of 0.
func TestMatchThresholdIsConfigurable(t *testing.T) {
	src := model.Track{
		Title:      "One More Time",
		Artists:    []string{"Daft Punk"},
		DurationMS: 317000,
	}
	library := []navidrome.Track{{
		ID:       "track-1",
		Title:    "One More Time",
		Artist:   "Daft Punk",
		Duration: 317,
	}}

	// matchWith runs a fresh matcher (and therefore a fresh cache) per threshold.
	matchWith := func(threshold float64) model.MatchedTrack {
		matcher := NewMatcher(fakeSearcher{tracks: library}, threshold)
		return matcher.MatchTrack(context.Background(), src)
	}

	if strict := matchWith(100); strict.Matched {
		t.Fatalf("expected no match with high threshold, score=%.1f", strict.Score)
	}
	if lenient := matchWith(0); !lenient.Matched {
		t.Fatalf("expected match with low threshold, score=%.1f", lenient.Score)
	}
}