// NaviMigrate/internal/match/matcher.go

package match

import (
	"context"
	"fmt"
	"math"
	"regexp"
	"sort"
	"strings"
	"sync"

	"navimigrate/internal/model"
	"navimigrate/internal/navidrome"
)

// Searcher is the track-search dependency of Matcher. Declaring it here lets
// the matcher work against any backend that can answer a free-text query.
type Searcher interface {
	// SearchTracks runs query against the backend and returns the tracks it
	// found, or an error. limit is presumably the maximum number of results
	// (TODO confirm against the implementation); Matcher always passes 20.
	SearchTracks(ctx context.Context, query string, limit int) ([]navidrome.Track, error)
}

// Matcher finds the best counterpart for a source track among the tracks
// returned by a Searcher. Because it embeds a mutex it must be used through a
// pointer, as NewMatcher returns it.
type Matcher struct {
	// searcher performs the actual track lookups.
	searcher  Searcher
	// threshold is the minimum score a candidate needs to count as a match.
	threshold float64
	// cacheMu guards cache.
	cacheMu   sync.RWMutex
	// cache holds successful search results keyed by the trimmed query.
	cache     map[string][]navidrome.Track
}

// NewMatcher builds a Matcher that looks tracks up through searcher and
// accepts a candidate once its score reaches threshold. A negative threshold
// is replaced by the default of 45. The search cache starts out empty.
func NewMatcher(searcher Searcher, threshold float64) *Matcher {
	const defaultThreshold = 45

	m := &Matcher{
		searcher:  searcher,
		threshold: threshold,
		cache:     make(map[string][]navidrome.Track),
	}
	if m.threshold < 0 {
		m.threshold = defaultThreshold
	}
	return m
}

// MatchTrack searches for the best counterpart of src and reports the outcome.
//
// Every query produced by buildQueries is run (through the cache), each
// distinct candidate is scored once with scoreCandidate, and the highest
// scoring candidate wins. The result is:
//
//   - Matched=true with TargetID, Score and Query set when the best score
//     reaches the matcher's threshold;
//   - Matched=false with the best candidate's TargetID, Score and Query plus a
//     Reason when the best score is below the threshold;
//   - Matched=false with only a Reason when src has no usable metadata, when
//     no candidate was found, or when searching failed.
//
// Searching is best effort: a failing query does not abort the match as long
// as another query yields candidates. If nothing is found and at least one
// query failed, the first error is reported in Reason instead of the
// misleading "no candidates".
func (m *Matcher) MatchTrack(ctx context.Context, src model.Track) model.MatchedTrack {
	queries := m.buildQueries(src)
	if len(queries) == 0 {
		return model.MatchedTrack{Source: src, Matched: false, Reason: "no usable metadata"}
	}

	type scored struct {
		track navidrome.Track
		score float64
		query string
	}

	// Start below any reachable score so the first real candidate always wins.
	best := scored{score: math.Inf(-1)}
	seen := make(map[string]struct{})
	var searchErr error
	for _, q := range queries {
		candidates, err := m.searchCached(ctx, q)
		if err != nil {
			if searchErr == nil {
				searchErr = err
			}
			// Once the context is done every further query would fail the
			// same way, so stop instead of hammering the backend.
			if ctx != nil && ctx.Err() != nil {
				break
			}
			continue
		}
		for _, c := range candidates {
			// A candidate without an ID cannot be referenced as a target and
			// must not shadow usable candidates with a lower score.
			if c.ID == "" {
				continue
			}
			if _, ok := seen[c.ID]; ok {
				continue
			}
			seen[c.ID] = struct{}{}
			if score := scoreCandidate(src, c); score > best.score {
				best = scored{track: c, score: score, query: q}
			}
		}
	}

	if best.track.ID == "" {
		reason := "no candidates"
		if searchErr != nil {
			reason = fmt.Sprintf("search failed: %v", searchErr)
		}
		return model.MatchedTrack{Source: src, Matched: false, Reason: reason}
	}

	if best.score >= m.threshold {
		return model.MatchedTrack{
			Source:   src,
			TargetID: best.track.ID,
			Score:    best.score,
			Query:    best.query,
			Matched:  true,
		}
	}

	reason := fmt.Sprintf("best score %.1f below threshold %.1f", best.score, m.threshold)
	return model.MatchedTrack{
		Source:   src,
		TargetID: best.track.ID,
		Score:    best.score,
		Query:    best.query,
		Matched:  false,
		Reason:   reason,
	}
}

// searchCached returns the search results for q, consulting the in-memory
// cache before asking the searcher. The query is trimmed first; a blank query
// yields (nil, nil) without touching the searcher. Only successful searches
// are stored, so a failed query is retried on the next call.
func (m *Matcher) searchCached(ctx context.Context, q string) ([]navidrome.Track, error) {
	const searchLimit = 20

	key := strings.TrimSpace(q)
	if len(key) == 0 {
		return nil, nil
	}

	m.cacheMu.RLock()
	tracks, hit := m.cache[key]
	m.cacheMu.RUnlock()
	if hit {
		return tracks, nil
	}

	// The lock is not held during the search, so concurrent callers asking
	// for the same uncached query may each hit the searcher.
	tracks, err := m.searcher.SearchTracks(ctx, key, searchLimit)
	if err != nil {
		return nil, err
	}

	m.cacheMu.Lock()
	m.cache[key] = tracks
	m.cacheMu.Unlock()

	return tracks, nil
}

// buildQueries derives the ordered, de-duplicated list of search queries for
// src, most specific first:
//
//  1. the ISRC, when present;
//  2. "title artist" and its Latin transliteration;
//  3. the same pair built from the cleaned title, when cleaning changed it;
//  4. the title alone and its Latin transliteration.
//
// Only the first entry of src.Artists is used. A track whose title is blank
// produces no queries (nil).
func (m *Matcher) buildQueries(src model.Track) []string {
	title := strings.TrimSpace(src.Title)
	if title == "" {
		return nil
	}

	var artist string
	if len(src.Artists) > 0 {
		artist = src.Artists[0]
	}
	latinTitle := strings.TrimSpace(transliterateToLatin(title))
	latinArtist := strings.TrimSpace(transliterateToLatin(artist))

	seen := make(map[string]struct{})
	out := make([]string, 0, 7)
	// add appends q unless it is blank after trimming or already present.
	add := func(q string) {
		q = strings.TrimSpace(q)
		if q == "" {
			return
		}
		if _, dup := seen[q]; dup {
			return
		}
		seen[q] = struct{}{}
		out = append(out, q)
	}

	if src.ISRC != "" {
		add(src.ISRC)
	}
	add(title + " " + artist)
	if latinTitle != "" {
		add(latinTitle + " " + latinArtist)
	}

	if stripped := cleanTitle(title); stripped != title {
		add(stripped + " " + artist)
		if latinStripped := strings.TrimSpace(transliterateToLatin(stripped)); latinStripped != "" {
			add(latinStripped + " " + latinArtist)
		}
	}

	add(title)
	if latinTitle != "" {
		add(latinTitle)
	}
	return out
}

// scoreCandidate rates how well dst matches src; higher is better.
//
// The score is the sum of:
//
//   - +60 when src carries an ISRC that dst lists;
//   - up to +25 for title similarity (on normalized titles);
//   - up to +20 for similarity between the first source artist and dst.Artist;
//   - +10 / +7 / +4 when the durations differ by at most 2 / 5 / 10, and -6
//     when they differ by more than 25 (only if both durations are positive);
//   - -8 when dst is marked "live" but src is not;
//   - -6 when dst mentions "remix" but src does not;
//   - -12 when dst mentions "karaoke".
func scoreCandidate(src model.Track, dst navidrome.Track) float64 {
	// Normalize both titles once; they feed the similarity and the penalties.
	srcTitle := normalize(src.Title)
	dstTitle := normalize(dst.Title)

	score := 0.0

	if src.ISRC != "" && hasISRC(dst.ISRCs, src.ISRC) {
		score += 60
	}

	score += 25 * similarity(srcTitle, dstTitle)

	if len(src.Artists) > 0 && src.Artists[0] != "" {
		score += 20 * similarity(normalize(src.Artists[0]), normalize(dst.Artist))
	}

	if src.DurationMS > 0 && dst.Duration > 0 {
		// DurationMS is divided by 1000 before the comparison, so dst.Duration
		// is presumably in seconds (TODO confirm against navidrome.Track).
		delta := math.Abs(float64(src.DurationMS/1000 - dst.Duration))
		switch {
		case delta <= 2:
			score += 10
		case delta <= 5:
			score += 7
		case delta <= 10:
			score += 4
		case delta > 25:
			score -= 6
		}
	}

	// hasWord reports whether w occurs as a whole token. Normalized strings
	// are tokens joined by single spaces, so padding both sides is enough.
	hasWord := func(s, w string) bool {
		return strings.Contains(" "+s+" ", " "+w+" ")
	}

	// "live" must match as a whole word: a substring test would treat titles
	// such as "Alive" or "Deliver" as live recordings and suppress the
	// penalty for an actual live version of them.
	if !hasWord(srcTitle, "live") && hasWord(dstTitle, "live") {
		score -= 8
	}
	// Substring on purpose, so "remixed" and "remixes" are caught too.
	if !strings.Contains(srcTitle, "remix") && strings.Contains(dstTitle, "remix") {
		score -= 6
	}
	if strings.Contains(dstTitle, "karaoke") {
		score -= 12
	}

	return score
}

// hasISRC reports whether wanted appears in candidates, ignoring letter case
// and surrounding whitespace. A blank wanted value never matches.
func hasISRC(candidates []string, wanted string) bool {
	target := strings.ToUpper(strings.TrimSpace(wanted))
	if len(target) == 0 {
		return false
	}
	for i := range candidates {
		if strings.EqualFold(strings.TrimSpace(candidates[i]), target) {
			return true
		}
	}
	return false
}

// nonAlphaNum matches every run of characters that are not Unicode letters,
// combining marks or digits. An ASCII-only class would erase whole titles
// written in scripts that are neither Latin nor transliterated (CJK, Greek,
// Arabic, ...), leaving nothing to compare.
var nonAlphaNum = regexp.MustCompile(`[^\p{L}\p{M}\p{N}]+`)

// normalize reduces s to a canonical form for comparison: Cyrillic is
// transliterated to Latin, the text is lower-cased, "&" becomes "and", every
// run of punctuation or whitespace becomes a single space, and leading and
// trailing spaces are dropped. The result is a list of tokens joined by
// single spaces, or "" when s holds no letters or digits.
func normalize(s string) string {
	s = transliterateToLatin(s)
	s = strings.ToLower(strings.TrimSpace(s))
	s = strings.ReplaceAll(s, "&", " and ")
	s = nonAlphaNum.ReplaceAllString(s, " ")
	return strings.Join(strings.Fields(s), " ")
}

// cyrillicToLatin maps Cyrillic letters (Russian plus the Ukrainian і, ї, є
// and ґ) to a lower-case Latin spelling. Upper- and lower-case letters map to
// the same value, and the hard and soft signs map to the empty string.
var cyrillicToLatin = map[rune]string{
	'а': "a", 'б': "b", 'в': "v", 'г': "g", 'д': "d", 'е': "e", 'ё': "e", 'ж': "zh", 'з': "z", 'и': "i", 'й': "i",
	'к': "k", 'л': "l", 'м': "m", 'н': "n", 'о': "o", 'п': "p", 'р': "r", 'с': "s", 'т': "t", 'у': "u", 'ф': "f",
	'х': "h", 'ц': "ts", 'ч': "ch", 'ш': "sh", 'щ': "shch", 'ъ': "", 'ы': "y", 'ь': "", 'э': "e", 'ю': "yu", 'я': "ya",
	'і': "i", 'ї': "yi", 'є': "ye", 'ґ': "g",
	'А': "a", 'Б': "b", 'В': "v", 'Г': "g", 'Д': "d", 'Е': "e", 'Ё': "e", 'Ж': "zh", 'З': "z", 'И': "i", 'Й': "i",
	'К': "k", 'Л': "l", 'М': "m", 'Н': "n", 'О': "o", 'П': "p", 'Р': "r", 'С': "s", 'Т': "t", 'У': "u", 'Ф': "f",
	'Х': "h", 'Ц': "ts", 'Ч': "ch", 'Ш': "sh", 'Щ': "shch", 'Ъ': "", 'Ы': "y", 'Ь': "", 'Э': "e", 'Ю': "yu", 'Я': "ya",
	'І': "i", 'Ї': "yi", 'Є': "ye", 'Ґ': "g",
}

// transliterateToLatin replaces every rune found in cyrillicToLatin with its
// Latin spelling and copies all other runes unchanged. Mapped letters come
// out lower-case regardless of their original case; unmapped text keeps its
// case.
func transliterateToLatin(s string) string {
	if len(s) == 0 {
		return s
	}

	var out strings.Builder
	// Some letters expand to several Latin characters; leave a little slack.
	out.Grow(len(s) + 8)
	for _, r := range s {
		latin, mapped := cyrillicToLatin[r]
		if !mapped {
			out.WriteRune(r)
			continue
		}
		out.WriteString(latin)
	}
	return out.String()
}

// cleanupRe matches edition markers in a track title: a parenthesised group
// containing one of the listed keywords (remaster, live, mono, ...), or a
// " - keyword" suffix together with everything after it. Matching is
// case-insensitive.
var cleanupRe = regexp.MustCompile(`(?i)\s*\(([^)]*(remaster|remastered|live|mono|stereo|version|deluxe|explicit|clean|bonus)[^)]*)\)|\s*-\s*(remaster(ed)?|live|version|edit|radio edit).*`)

// cleanTitle removes edition markers from s and trims the result. When that
// would leave nothing, the original s is returned untouched.
func cleanTitle(s string) string {
	if stripped := strings.TrimSpace(cleanupRe.ReplaceAllString(s, "")); stripped != "" {
		return stripped
	}
	return s
}

// similarity scores how alike a and b are on a scale from 0 to 1. Identical
// strings score 1; an empty string on either side, or no whitespace-separated
// token in common, scores 0. Otherwise the result blends the Jaccard index of
// the two token sets (weight 0.6) with the Levenshtein ratio of the full
// strings (weight 0.4).
func similarity(a, b string) float64 {
	switch {
	case a == "" || b == "":
		return 0
	case a == b:
		return 1
	}

	left, right := tokenSet(a), tokenSet(b)
	if len(left) == 0 || len(right) == 0 {
		return 0
	}

	shared := 0
	for tok := range left {
		if _, both := right[tok]; both {
			shared++
		}
	}
	if shared == 0 {
		return 0
	}

	union := len(left) + len(right) - shared
	jaccard := float64(shared) / float64(union)
	lev := levenshteinRatio(a, b)
	return (jaccard * 0.6) + (lev * 0.4)
}

// tokenSet splits s on whitespace and returns the distinct tokens as a set.
// The returned map is never nil, even when s has no tokens.
func tokenSet(s string) map[string]struct{} {
	words := strings.Fields(s)
	out := make(map[string]struct{}, len(words))
	for i := range words {
		out[words[i]] = struct{}{}
	}
	return out
}

// levenshteinRatio turns the edit distance between a and b into a similarity
// in [0, 1]: 1 minus the distance divided by the length of the longer string.
// Lengths are counted in runes. It returns 0 when either string is empty.
func levenshteinRatio(a, b string) float64 {
	left, right := []rune(a), []rune(b)
	if len(left) == 0 || len(right) == 0 {
		return 0
	}

	longest := len(right)
	if len(left) > longest {
		longest = len(left)
	}
	dist := levenshtein(left, right)
	return 1 - float64(dist)/float64(longest)
}

// levenshtein returns the edit distance between a and b: the minimum number
// of single-rune insertions, deletions and substitutions turning one into the
// other. It keeps a single row of the dynamic-programming table, so it needs
// len(b)+1 ints of working space.
func levenshtein(a, b []rune) int {
	// row[j] is the distance between the processed prefix of a and b[:j].
	row := make([]int, len(b)+1)
	for j := range row {
		row[j] = j
	}

	for i, ra := range a {
		// diag carries the previous row's value one column to the left.
		diag := row[0]
		row[0] = i + 1
		for j, rb := range b {
			above := row[j+1]
			subst := diag
			if ra != rb {
				subst++
			}
			row[j+1] = min3(above+1, row[j]+1, subst)
			diag = above
		}
	}
	return row[len(b)]
}

// min3 returns the smallest of a, b and c.
func min3(a, b, c int) int {
	vals := sort.IntSlice{a, b, c}
	vals.Sort()
	return vals[0]
}