Merge pull request #15 from PiratesIRC/claude/investigate-fuzzy-matcher-bug-01R5rQiHwBMwKZfAUCcJMzKW
Claude/investigate fuzzy matcher bug 01 r5r qi hw b mw k zf au cc j mz kw
This commit is contained in:
@@ -11,7 +11,7 @@ import logging
|
|||||||
from glob import glob
|
from glob import glob
|
||||||
|
|
||||||
# Version: YY.DDD.HHMM (Julian date format: Year.DayOfYear.Time)
|
# Version: YY.DDD.HHMM (Julian date format: Year.DayOfYear.Time)
|
||||||
__version__ = "25.314.1907"
|
__version__ = "25.317.1200"
|
||||||
|
|
||||||
# Setup logging
|
# Setup logging
|
||||||
LOGGER = logging.getLogger("plugins.fuzzy_matcher")
|
LOGGER = logging.getLogger("plugins.fuzzy_matcher")
|
||||||
@@ -213,6 +213,9 @@ class FuzzyMatcher:
|
|||||||
if user_ignored_tags is None:
|
if user_ignored_tags is None:
|
||||||
user_ignored_tags = []
|
user_ignored_tags = []
|
||||||
|
|
||||||
|
# Store original for logging
|
||||||
|
original_name = name
|
||||||
|
|
||||||
# Remove leading parenthetical prefixes like (SP2), (D1), etc.
|
# Remove leading parenthetical prefixes like (SP2), (D1), etc.
|
||||||
name = re.sub(r'^\([^\)]+\)\s*', '', name)
|
name = re.sub(r'^\([^\)]+\)\s*', '', name)
|
||||||
|
|
||||||
@@ -223,7 +226,8 @@ class FuzzyMatcher:
|
|||||||
quality_tags = {'HD', 'SD', 'FD', 'UHD', 'FHD'}
|
quality_tags = {'HD', 'SD', 'FD', 'UHD', 'FHD'}
|
||||||
|
|
||||||
# Check for 2-3 letter prefix with colon or space at start
|
# Check for 2-3 letter prefix with colon or space at start
|
||||||
prefix_match = re.match(r'^([A-Z]{2,3})[:|\s]\s*', name)
|
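# Fixed regex: [:\s] instead of [:|\s] (inside a character class '|' is a literal pipe, not alternation, so the old pattern also accepted '|' as a separator; '\s' itself is unchanged)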
|
||||||
|
prefix_match = re.match(r'^([A-Z]{2,3})[:\s]\s*', name)
|
||||||
if prefix_match:
|
if prefix_match:
|
||||||
prefix = prefix_match.group(1).upper()
|
prefix = prefix_match.group(1).upper()
|
||||||
# Only remove if it's NOT a quality tag
|
# Only remove if it's NOT a quality tag
|
||||||
@@ -281,6 +285,10 @@ class FuzzyMatcher:
|
|||||||
# Clean up whitespace
|
# Clean up whitespace
|
||||||
name = re.sub(r'\s+', ' ', name).strip()
|
name = re.sub(r'\s+', ' ', name).strip()
|
||||||
|
|
||||||
|
# Log warning if normalization resulted in empty string (indicates overly aggressive stripping)
|
||||||
|
if not name:
|
||||||
|
self.logger.warning(f"normalize_name returned empty string for input: '{original_name}' (original input was stripped too aggressively)")
|
||||||
|
|
||||||
return name
|
return name
|
||||||
|
|
||||||
def extract_tags(self, name, user_ignored_tags=None):
|
def extract_tags(self, name, user_ignored_tags=None):
|
||||||
@@ -353,8 +361,10 @@ class FuzzyMatcher:
|
|||||||
if len(str1) < len(str2):
|
if len(str1) < len(str2):
|
||||||
str1, str2 = str2, str1
|
str1, str2 = str2, str1
|
||||||
|
|
||||||
if len(str2) == 0:
|
# Empty strings should not match anything (including other empty strings)
|
||||||
return 1.0 if len(str1) == 0 else 0.0
|
# This prevents false positives when normalization strips everything
|
||||||
|
if len(str2) == 0 or len(str1) == 0:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
previous_row = list(range(len(str2) + 1))
|
previous_row = list(range(len(str2) + 1))
|
||||||
|
|
||||||
@@ -429,6 +439,11 @@ class FuzzyMatcher:
|
|||||||
for candidate in candidate_names:
|
for candidate in candidate_names:
|
||||||
# Normalize candidate (stream name) with Cinemax removal if requested
|
# Normalize candidate (stream name) with Cinemax removal if requested
|
||||||
candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
|
candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
|
||||||
|
|
||||||
|
# Skip candidates that normalize to empty or very short strings
|
||||||
|
if not candidate_normalized or len(candidate_normalized) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
processed_candidate = self.process_string_for_matching(candidate_normalized)
|
processed_candidate = self.process_string_for_matching(candidate_normalized)
|
||||||
score = self.calculate_similarity(processed_query, processed_candidate)
|
score = self.calculate_similarity(processed_query, processed_candidate)
|
||||||
|
|
||||||
@@ -481,6 +496,12 @@ class FuzzyMatcher:
|
|||||||
for candidate in candidate_names:
|
for candidate in candidate_names:
|
||||||
# Normalize candidate (stream name) with Cinemax removal if requested
|
# Normalize candidate (stream name) with Cinemax removal if requested
|
||||||
candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
|
candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
|
||||||
|
|
||||||
|
# Skip candidates that normalize to empty or very short strings (< 2 chars)
|
||||||
|
# This prevents false positives where multiple streams all normalize to ""
|
||||||
|
if not candidate_normalized or len(candidate_normalized) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
candidate_lower = candidate_normalized.lower()
|
candidate_lower = candidate_normalized.lower()
|
||||||
candidate_nospace = re.sub(r'[\s&\-]+', '', candidate_lower)
|
candidate_nospace = re.sub(r'[\s&\-]+', '', candidate_lower)
|
||||||
|
|
||||||
@@ -502,6 +523,11 @@ class FuzzyMatcher:
|
|||||||
for candidate in candidate_names:
|
for candidate in candidate_names:
|
||||||
# Normalize candidate (stream name) with Cinemax removal if requested
|
# Normalize candidate (stream name) with Cinemax removal if requested
|
||||||
candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
|
candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
|
||||||
|
|
||||||
|
# Skip candidates that normalize to empty or very short strings
|
||||||
|
if not candidate_normalized or len(candidate_normalized) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
candidate_lower = candidate_normalized.lower()
|
candidate_lower = candidate_normalized.lower()
|
||||||
|
|
||||||
# Check if one is a substring of the other
|
# Check if one is a substring of the other
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ class Plugin:
|
|||||||
"""Dispatcharr Stream-Mapparr Plugin"""
|
"""Dispatcharr Stream-Mapparr Plugin"""
|
||||||
|
|
||||||
name = "Stream-Mapparr"
|
name = "Stream-Mapparr"
|
||||||
version = "0.5.0"
|
version = "0.5.1"
|
||||||
description = "🎯 Automatically add matching streams to channels based on name similarity and quality precedence with enhanced fuzzy matching"
|
description = "🎯 Automatically add matching streams to channels based on name similarity and quality precedence with enhanced fuzzy matching"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -149,13 +149,6 @@ class Plugin:
|
|||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
# Add channel database section header
|
|
||||||
static_fields.append({
|
|
||||||
"id": "channel_databases_header",
|
|
||||||
"type": "info",
|
|
||||||
"label": "📚 Channel Databases",
|
|
||||||
})
|
|
||||||
|
|
||||||
# Dynamically add channel database enable/disable fields
|
# Dynamically add channel database enable/disable fields
|
||||||
try:
|
try:
|
||||||
databases = self._get_channel_databases()
|
databases = self._get_channel_databases()
|
||||||
@@ -753,7 +746,8 @@ class Plugin:
|
|||||||
# Remove country code prefix if requested
|
# Remove country code prefix if requested
|
||||||
if remove_country_prefix:
|
if remove_country_prefix:
|
||||||
quality_tags = {'HD', 'SD', 'FD', 'UHD', 'FHD'}
|
quality_tags = {'HD', 'SD', 'FD', 'UHD', 'FHD'}
|
||||||
prefix_match = re.match(r'^([A-Z]{2,3})[:|\s]\s*', cleaned)
|
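# Fixed regex: [:\s] instead of [:|\s] (inside a character class '|' is a literal pipe, not alternation, so the old pattern also accepted '|' as a separator; '\s' itself is unchanged)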
|
||||||
|
prefix_match = re.match(r'^([A-Z]{2,3})[:\s]\s*', cleaned)
|
||||||
if prefix_match:
|
if prefix_match:
|
||||||
prefix = prefix_match.group(1).upper()
|
prefix = prefix_match.group(1).upper()
|
||||||
if prefix not in quality_tags:
|
if prefix not in quality_tags:
|
||||||
@@ -1025,6 +1019,12 @@ class Plugin:
|
|||||||
ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
|
ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Skip if either cleaned name is empty or too short (prevents false positives)
|
||||||
|
if not cleaned_stream or len(cleaned_stream) < 2:
|
||||||
|
continue
|
||||||
|
if not cleaned_matched or len(cleaned_matched) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
if cleaned_stream.lower() == cleaned_matched.lower():
|
if cleaned_stream.lower() == cleaned_matched.lower():
|
||||||
matching_streams.append(stream)
|
matching_streams.append(stream)
|
||||||
|
|
||||||
@@ -1064,6 +1064,12 @@ class Plugin:
|
|||||||
ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
|
ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Skip if either cleaned name is empty or too short (prevents false positives)
|
||||||
|
if not cleaned_stream_name or len(cleaned_stream_name) < 2:
|
||||||
|
continue
|
||||||
|
if not cleaned_channel_name or len(cleaned_channel_name) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
if cleaned_stream_name.lower() == cleaned_channel_name.lower():
|
if cleaned_stream_name.lower() == cleaned_channel_name.lower():
|
||||||
matching_streams.append(stream)
|
matching_streams.append(stream)
|
||||||
|
|
||||||
@@ -1086,6 +1092,12 @@ class Plugin:
|
|||||||
ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
|
ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Skip if either cleaned name is empty or too short (prevents false positives)
|
||||||
|
if not cleaned_stream_name or len(cleaned_stream_name) < 2:
|
||||||
|
continue
|
||||||
|
if not cleaned_channel_name or len(cleaned_channel_name) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
# Simple case-insensitive substring matching
|
# Simple case-insensitive substring matching
|
||||||
if cleaned_channel_name.lower() in cleaned_stream_name.lower() or cleaned_stream_name.lower() in cleaned_channel_name.lower():
|
if cleaned_channel_name.lower() in cleaned_stream_name.lower() or cleaned_stream_name.lower() in cleaned_channel_name.lower():
|
||||||
matching_streams.append(stream)
|
matching_streams.append(stream)
|
||||||
|
|||||||
194
test_fuzzy_matcher_fix.py
Normal file
194
test_fuzzy_matcher_fix.py
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script to verify the fuzzy matcher bug fix.
|
||||||
|
Tests that streams which normalize to empty strings don't produce false positive matches.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add the Stream-Mapparr directory to the path
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'Stream-Mapparr'))
|
||||||
|
|
||||||
|
from fuzzy_matcher import FuzzyMatcher
|
||||||
|
|
||||||
|
def test_empty_string_normalization():
    """Exercise matching when the ignore list can strip whole stream names.

    Normalizes a handful of "BR <name> SD" streams with an ignore list that
    contains the channel names themselves, prints each result, and then asks
    the matcher to find the channel "GNT" among those streams.

    Returns:
        False only when ``fuzzy_match`` returns a truthy name other than
        "BR GNT SD". A correct match, or no match at all, returns True.
    """
    banner = "=" * 80
    print(banner)
    print("Test 1: Empty String Normalization")
    print(banner)

    # The ignore list deliberately includes the channel names, so stripping
    # the tags may leave nothing behind for some streams.
    ignored = ["TNT", "MTV", "GNT", "TLC", "BIS", "SD", "HD"]
    stream_names = [
        "BR TNT SD",
        "BR MTV SD",
        "BR GNT SD",
        "BR TLC SD",
        "BR BIS SD"
    ]

    fm = FuzzyMatcher(plugin_dir=None, match_threshold=85)

    # Show what each stream looks like after normalization.
    print("\nNormalizing streams with aggressive tags:")
    for raw in stream_names:
        cleaned = fm.normalize_name(raw, ignored, remove_country_prefix=True)
        print(f" '{raw}' -> '{cleaned}' (len={len(cleaned)})")

    print("\n" + "-" * 80)

    # Try to match the "GNT" channel against the same streams.
    target = "GNT"
    print(f"\nAttempting to match channel '{target}' against test streams...")

    best, score, kind = fm.fuzzy_match(
        target,
        stream_names,
        ignored,
        remove_cinemax=False
    )

    if not best:
        # No match is tolerated: every candidate may have been skipped.
        print(f"✗ No match found (score: {score})")
        print(" This could be acceptable if all streams normalize to empty strings")
    else:
        print(f"✓ Match found: '{best}' with score {score} (type: {kind})")
        # The only acceptable winner is "BR GNT SD".
        if best != "BR GNT SD":
            print(f"✗ FAIL: Matched wrong stream! Expected 'BR GNT SD', got '{best}'")
            return False
        print("✓ PASS: Matched the correct stream!")

    print()
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_string_similarity():
    """Check that ``calculate_similarity`` scores empty input as 0.0.

    Two comparisons are made in order: empty vs. empty, then empty vs.
    "test". The function stops at the first comparison whose score is not
    0.0.

    Returns:
        True when both scores equal 0.0, otherwise False.
    """
    banner = "=" * 80
    print(banner)
    print("Test 2: Empty String Similarity")
    print(banner)

    fm = FuzzyMatcher(plugin_dir=None, match_threshold=85)

    # Empty vs. empty must not count as a perfect match.
    both_empty = fm.calculate_similarity("", "")
    print(f"\nSimilarity('', '') = {both_empty}")
    if both_empty != 0.0:
        print(f"✗ FAIL: Empty strings return {both_empty} similarity (should be 0.0)")
        return False
    print("✓ PASS: Empty strings return 0.0 similarity (no false positive match)")

    # Empty vs. non-empty.
    one_empty = fm.calculate_similarity("", "test")
    print(f"Similarity('', 'test') = {one_empty}")
    if one_empty != 0.0:
        print(f"✗ FAIL: Empty vs non-empty returns {one_empty} (should be 0.0)")
        return False
    print("✓ PASS: Empty string vs non-empty returns 0.0")

    print()
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def test_valid_matches_still_work():
    """Run a few ordinary channel/stream lookups through ``fuzzy_match``.

    Each scenario names a channel, the candidate streams, the stream that is
    expected to win, and the user ignore list (empty in every scenario here).
    All scenarios are run even after a failure so every result is printed.

    Returns:
        True when every scenario returned its expected stream name,
        otherwise False.
    """
    banner = "=" * 80
    print(banner)
    print("Test 3: Valid Matches Still Work")
    print(banner)

    fm = FuzzyMatcher(plugin_dir=None, match_threshold=85)

    scenarios = [
        {
            "channel": "CNN",
            "streams": ["CNN HD", "CNN SD", "Fox News HD"],
            "expected": "CNN HD",
            "user_tags": []
        },
        {
            "channel": "HBO",
            "streams": ["HBO East HD", "HBO West SD", "Showtime HD"],
            "expected": "HBO East HD",
            "user_tags": []
        },
        {
            "channel": "ESPN",
            "streams": ["ESPN HD", "ESPN2 HD", "Fox Sports HD"],
            "expected": "ESPN HD",
            "user_tags": []
        }
    ]

    outcomes = []
    for number, scenario in enumerate(scenarios, start=1):
        wanted = scenario["expected"]
        candidates = scenario["streams"]
        query = scenario["channel"]

        print(f"\nTest case {number}: Matching '{query}' against {candidates}")

        winner, score, kind = fm.fuzzy_match(
            query,
            candidates,
            scenario["user_tags"],
            remove_cinemax=False
        )

        hit = winner == wanted
        if hit:
            print(f"✓ PASS: Matched '{winner}' (score: {score}, type: {kind})")
        else:
            print(f"✗ FAIL: Expected '{wanted}', got '{winner}'")
        outcomes.append(hit)

    print()
    return all(outcomes)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the three verification tests in order and print a summary.

    Returns:
        0 when every test function returned a truthy value, otherwise 1, so
        the result can be used directly as a process exit status.
    """
    banner = "=" * 80

    print("\n" + banner)
    print("FUZZY MATCHER BUG FIX VERIFICATION")
    print(banner + "\n")

    # Order matters for the printed output: similarity check goes first.
    suite = (
        ("Empty String Similarity", test_empty_string_similarity),
        ("Empty String Normalization", test_empty_string_normalization),
        ("Valid Matches Still Work", test_valid_matches_still_work),
    )
    results = [(label, run()) for label, run in suite]

    # Per-test summary lines.
    print(banner)
    print("TEST SUMMARY")
    print(banner)
    for label, ok in results:
        marker = "✓ PASS" if ok else "✗ FAIL"
        print(f"{marker}: {label}")

    everything_ok = all(ok for _, ok in results)

    print("\n" + banner)
    print("✓ ALL TESTS PASSED!" if everything_ok else "✗ SOME TESTS FAILED")
    print(banner + "\n")

    if everything_ok:
        return 0
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # When run as a script, hand main()'s return value to the shell as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
|
||||||
Reference in New Issue
Block a user