Merge pull request #15 from PiratesIRC/claude/investigate-fuzzy-matcher-bug-01R5rQiHwBMwKZfAUCcJMzKW

Claude/investigate fuzzy matcher bug 01 r5r qi hw b mw k zf au cc j mz kw
2025-11-13 12:18:29 -06:00
parent f941e513c9 9caa91b01a
commit 46d941165b
3 changed files with 249 additions and 17 deletions
--- a/Stream-Mapparr/fuzzy_matcher.py
+++ b/Stream-Mapparr/fuzzy_matcher.py
@@ -11,7 +11,7 @@ import logging
 from glob import glob
 # Version: YY.DDD.HHMM (Julian date format: Year.DayOfYear.Time)
-__version__ = "25.314.1907"
+__version__ = "25.317.1200"
 # Setup logging
 LOGGER = logging.getLogger("plugins.fuzzy_matcher")
@@ -213,6 +213,9 @@ class FuzzyMatcher:
        if user_ignored_tags is None:
            user_ignored_tags = []
        # Store original for logging
        original_name = name
        # Remove leading parenthetical prefixes like (SP2), (D1), etc.
        name = re.sub(r'^\([^\)]+\)\s*', '', name)
@@ -223,7 +226,8 @@ class FuzzyMatcher:
            quality_tags = {'HD', 'SD', 'FD', 'UHD', 'FHD'}
            # Check for 2-3 letter prefix with colon or space at start
-            prefix_match = re.match(r'^([A-Z]{2,3})[:|\s]\s*', name)
+            # Fixed regex: [:\s] instead of [:|\s] (inside a character class "|" is a literal pipe, not alternation; \s is unchanged)
            prefix_match = re.match(r'^([A-Z]{2,3})[:\s]\s*', name)
            if prefix_match:
                prefix = prefix_match.group(1).upper()
                # Only remove if it's NOT a quality tag
@@ -281,6 +285,10 @@ class FuzzyMatcher:
        # Clean up whitespace
        name = re.sub(r'\s+', ' ', name).strip()
        # Log warning if normalization resulted in empty string (indicates overly aggressive stripping)
        if not name:
            self.logger.warning(f"normalize_name returned empty string for input: '{original_name}' (original input was stripped too aggressively)")
        return name
    def extract_tags(self, name, user_ignored_tags=None):
@@ -353,8 +361,10 @@ class FuzzyMatcher:
        if len(str1) < len(str2):
            str1, str2 = str2, str1
-        if len(str2) == 0:
+        # Empty strings should not match anything (including other empty strings)
-            return 1.0 if len(str1) == 0 else 0.0
+        # This prevents false positives when normalization strips everything
        if len(str2) == 0 or len(str1) == 0:
            return 0.0
        previous_row = list(range(len(str2) + 1))
@@ -429,6 +439,11 @@ class FuzzyMatcher:
        for candidate in candidate_names:
            # Normalize candidate (stream name) with Cinemax removal if requested
            candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
            # Skip candidates that normalize to empty or very short strings
            if not candidate_normalized or len(candidate_normalized) < 2:
                continue
            processed_candidate = self.process_string_for_matching(candidate_normalized)
            score = self.calculate_similarity(processed_query, processed_candidate)
@@ -481,6 +496,12 @@ class FuzzyMatcher:
        for candidate in candidate_names:
            # Normalize candidate (stream name) with Cinemax removal if requested
            candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
            # Skip candidates that normalize to empty or very short strings (< 2 chars)
            # This prevents false positives where multiple streams all normalize to ""
            if not candidate_normalized or len(candidate_normalized) < 2:
                continue
            candidate_lower = candidate_normalized.lower()
            candidate_nospace = re.sub(r'[\s&\-]+', '', candidate_lower)
@@ -502,6 +523,11 @@ class FuzzyMatcher:
        for candidate in candidate_names:
            # Normalize candidate (stream name) with Cinemax removal if requested
            candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax)
            # Skip candidates that normalize to empty or very short strings
            if not candidate_normalized or len(candidate_normalized) < 2:
                continue
            candidate_lower = candidate_normalized.lower()
            # Check if one is a substring of the other
--- a/Stream-Mapparr/plugin.py
+++ b/Stream-Mapparr/plugin.py
@@ -33,7 +33,7 @@ class Plugin:
    """Dispatcharr Stream-Mapparr Plugin"""
    name = "Stream-Mapparr"
-    version = "0.5.0"
+    version = "0.5.1"
    description = "🎯 Automatically add matching streams to channels based on name similarity and quality precedence with enhanced fuzzy matching"
    @property
@@ -149,13 +149,6 @@ class Plugin:
            },
        ]
        # Add channel database section header
        static_fields.append({
            "id": "channel_databases_header",
            "type": "info",
            "label": "📚 Channel Databases",
        })
        # Dynamically add channel database enable/disable fields
        try:
            databases = self._get_channel_databases()
@@ -753,7 +746,8 @@ class Plugin:
        # Remove country code prefix if requested
        if remove_country_prefix:
            quality_tags = {'HD', 'SD', 'FD', 'UHD', 'FHD'}
-            prefix_match = re.match(r'^([A-Z]{2,3})[:|\s]\s*', cleaned)
+            # Fixed regex: [:\s] instead of [:|\s] (inside a character class "|" is a literal pipe, not alternation; \s is unchanged)
            prefix_match = re.match(r'^([A-Z]{2,3})[:\s]\s*', cleaned)
            if prefix_match:
                prefix = prefix_match.group(1).upper()
                if prefix not in quality_tags:
@@ -1025,6 +1019,12 @@ class Plugin:
                        ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
                    )
                    # Skip if either cleaned name is empty or too short (prevents false positives)
                    if not cleaned_stream or len(cleaned_stream) < 2:
                        continue
                    if not cleaned_matched or len(cleaned_matched) < 2:
                        continue
                    if cleaned_stream.lower() == cleaned_matched.lower():
                        matching_streams.append(stream)
@@ -1064,6 +1064,12 @@ class Plugin:
                    ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
                )
                # Skip if either cleaned name is empty or too short (prevents false positives)
                if not cleaned_stream_name or len(cleaned_stream_name) < 2:
                    continue
                if not cleaned_channel_name or len(cleaned_channel_name) < 2:
                    continue
                if cleaned_stream_name.lower() == cleaned_channel_name.lower():
                    matching_streams.append(stream)
@@ -1086,6 +1092,12 @@ class Plugin:
                ignore_geographic, ignore_misc, remove_cinemax=channel_has_max
            )
            # Skip if either cleaned name is empty or too short (prevents false positives)
            if not cleaned_stream_name or len(cleaned_stream_name) < 2:
                continue
            if not cleaned_channel_name or len(cleaned_channel_name) < 2:
                continue
            # Simple case-insensitive substring matching
            if cleaned_channel_name.lower() in cleaned_stream_name.lower() or cleaned_stream_name.lower() in cleaned_channel_name.lower():
                matching_streams.append(stream)
--- a/test_fuzzy_matcher_fix.py
+++ b/test_fuzzy_matcher_fix.py
@@ -0,0 +1,194 @@
 #!/usr/bin/env python3
 """
 Test script to verify the fuzzy matcher bug fix.
 Tests that streams which normalize to empty strings don't produce false positive matches.
 """
 import sys
 import os
 # Add the Stream-Mapparr directory to the path
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'Stream-Mapparr'))
 from fuzzy_matcher import FuzzyMatcher
 def test_empty_string_normalization():
    """Exercise matching when the ignore list strips most of each stream name.

    Normalizes five "BR <name> SD" streams using an ignore list that contains
    every one of those names plus SD/HD, prints each normalized form, and then
    matches the channel "GNT" against the raw stream names.

    Returns:
        False only when fuzzy_match reports a stream other than "BR GNT SD".
        A missing match is reported on stdout but still yields True.
    """
    rule = "=" * 80
    print(rule)
    print("Test 1: Empty String Normalization")
    print(rule)

    fuzzy = FuzzyMatcher(plugin_dir=None, match_threshold=85)

    # Stream names that may be reduced to nothing by normalization
    candidate_streams = [
        "BR TNT SD",
        "BR MTV SD",
        "BR GNT SD",
        "BR TLC SD",
        "BR BIS SD"
    ]
    # Deliberately aggressive: the ignore list includes the channel names themselves
    aggressive_tags = ["TNT", "MTV", "GNT", "TLC", "BIS", "SD", "HD"]

    print("\nNormalizing streams with aggressive tags:")
    for raw_name in candidate_streams:
        cleaned = fuzzy.normalize_name(raw_name, aggressive_tags, remove_country_prefix=True)
        print(f"  '{raw_name}' -> '{cleaned}' (len={len(cleaned)})")
    print("\n" + "-" * 80)

    # Match the "GNT" channel against the unnormalized stream names
    target_channel = "GNT"
    print(f"\nAttempting to match channel '{target_channel}' against test streams...")
    best_name, best_score, kind_of_match = fuzzy.fuzzy_match(
        target_channel,
        candidate_streams,
        aggressive_tags,
        remove_cinemax=False
    )

    if not best_name:
        print(f"✗ No match found (score: {best_score})")
        print("  This could be acceptable if all streams normalize to empty strings")
    else:
        print(f"✓ Match found: '{best_name}' with score {best_score} (type: {kind_of_match})")
        # The only acceptable match is "BR GNT SD"
        if best_name != "BR GNT SD":
            print(f"✗ FAIL: Matched wrong stream! Expected 'BR GNT SD', got '{best_name}'")
            return False
        print("✓ PASS: Matched the correct stream!")

    print()
    return True
 def test_empty_string_similarity():
    """Check that calculate_similarity scores empty input as 0.0.

    Two comparisons are made, in order: ("", "") and ("", "test"). The second
    is attempted only when the first one passes.

    Returns:
        True when both scores equal 0.0, otherwise False (after printing the
        failing score).
    """
    rule = "=" * 80
    print(rule)
    print("Test 2: Empty String Similarity")
    print(rule)

    fuzzy = FuzzyMatcher(plugin_dir=None, match_threshold=85)

    # Empty against empty
    both_empty = fuzzy.calculate_similarity("", "")
    print(f"\nSimilarity('', '') = {both_empty}")
    if both_empty != 0.0:
        print(f"✗ FAIL: Empty strings return {both_empty} similarity (should be 0.0)")
        return False
    print("✓ PASS: Empty strings return 0.0 similarity (no false positive match)")

    # Empty against non-empty
    one_empty = fuzzy.calculate_similarity("", "test")
    print(f"Similarity('', 'test') = {one_empty}")
    if one_empty != 0.0:
        print(f"✗ FAIL: Empty vs non-empty returns {one_empty} (should be 0.0)")
        return False
    print("✓ PASS: Empty string vs non-empty returns 0.0")

    print()
    return True
 def test_valid_matches_still_work():
    """Confirm that ordinary channel/stream pairs are still matched.

    Runs fuzzy_match for three channels (CNN, HBO, ESPN) with no user tags
    and compares the returned stream name with the expected one. Every case
    is executed even after a failure so the full report is printed.

    Returns:
        True when every case returned its expected stream, otherwise False.
    """
    rule = "=" * 80
    print(rule)
    print("Test 3: Valid Matches Still Work")
    print(rule)

    fuzzy = FuzzyMatcher(plugin_dir=None, match_threshold=85)

    scenarios = [
        {
            "channel": "CNN",
            "streams": ["CNN HD", "CNN SD", "Fox News HD"],
            "expected": "CNN HD",
            "user_tags": []
        },
        {
            "channel": "HBO",
            "streams": ["HBO East HD", "HBO West SD", "Showtime HD"],
            "expected": "HBO East HD",
            "user_tags": []
        },
        {
            "channel": "ESPN",
            "streams": ["ESPN HD", "ESPN2 HD", "Fox Sports HD"],
            "expected": "ESPN HD",
            "user_tags": []
        }
    ]

    failure_count = 0
    for number, scenario in enumerate(scenarios, start=1):
        wanted = scenario["expected"]
        print(f"\nTest case {number}: Matching '{scenario['channel']}' against {scenario['streams']}")
        best_name, best_score, kind_of_match = fuzzy.fuzzy_match(
            scenario["channel"],
            scenario["streams"],
            scenario["user_tags"],
            remove_cinemax=False
        )
        if best_name != wanted:
            print(f"✗ FAIL: Expected '{wanted}', got '{best_name}'")
            failure_count += 1
            continue
        print(f"✓ PASS: Matched '{best_name}' (score: {best_score}, type: {kind_of_match})")

    print()
    return failure_count == 0
 def main():
    """Run the three verification tests and print a pass/fail summary.

    The tests run in a fixed order: similarity, normalization, valid matches.

    Returns:
        0 when every test returned a truthy result, otherwise 1 (suitable as
        a process exit status).
    """
    rule = "=" * 80
    print("\n" + rule)
    print("FUZZY MATCHER BUG FIX VERIFICATION")
    print(rule + "\n")

    # (label, callable) pairs, executed top to bottom
    suite = (
        ("Empty String Similarity", test_empty_string_similarity),
        ("Empty String Normalization", test_empty_string_normalization),
        ("Valid Matches Still Work", test_valid_matches_still_work),
    )
    outcomes = [(label, check()) for label, check in suite]

    # Summary table
    print(rule)
    print("TEST SUMMARY")
    print(rule)
    failed_labels = []
    for label, ok in outcomes:
        marker = "✓ PASS" if ok else "✗ FAIL"
        print(f"{marker}: {label}")
        if not ok:
            failed_labels.append(label)

    print("\n" + rule)
    if failed_labels:
        print("✗ SOME TESTS FAILED")
    else:
        print("✓ ALL TESTS PASSED!")
    print(rule + "\n")
    return 1 if failed_labels else 0
 if __name__ == "__main__":
    sys.exit(main())