From 833ba094fc548a7cc018e1fc7b825c15ff5ae475 Mon Sep 17 00:00:00 2001 From: Pirates IRC <98669745+PiratesIRC@users.noreply.github.com> Date: Sun, 28 Dec 2025 07:34:00 -0600 Subject: [PATCH] 0.7.3 # Stream-Mapparr Plugin Changelog (v0.7.2 -> v0.7.3) ### Matching Logic Improvements - Implemented a length ratio check (75%) for substring matches to prevent partial word false positives (e.g., preventing "story" from matching "history"). - Added strict validation for numeric tokens; streams must now explicitly contain matching numbers if the channel name includes them (e.g., prevents "BBC1" from matching "CBBC"). ### Features - Added the underlying FuzzyMatcher library version number to the header of generated CSV export files for better debugging. ### Maintenance - Updated the minimum required FuzzyMatcher version to 25.358.0200. --- Stream-Mapparr/fuzzy_matcher.py | 131 +++++++++++++++++++++++++------- Stream-Mapparr/plugin.py | 29 +++++-- 2 files changed, 125 insertions(+), 35 deletions(-) diff --git a/Stream-Mapparr/fuzzy_matcher.py b/Stream-Mapparr/fuzzy_matcher.py index fc3819b..5ec8ab3 100644 --- a/Stream-Mapparr/fuzzy_matcher.py +++ b/Stream-Mapparr/fuzzy_matcher.py @@ -12,7 +12,7 @@ import unicodedata from glob import glob # Version: YY.DDD.HHMM (Julian date format: Year.DayOfYear.Time) -__version__ = "25.354.1835" +__version__ = "25.358.0200" # Setup logging LOGGER = logging.getLogger("plugins.fuzzy_matcher") @@ -398,7 +398,20 @@ class FuzzyMatcher: patterns_to_apply.extend(GEOGRAPHIC_PATTERNS) if ignore_misc: - patterns_to_apply.extend(MISC_PATTERNS) + # CRITICAL FIX: Only apply MISC_PATTERNS (which removes ALL parentheses) if we're also + # ignoring regional tags. Otherwise, MISC_PATTERNS would strip regional indicators like + # "(WEST)" even when the user has set ignore_regional=False. + # This ensures that "BBC America" won't match "BBC AMERICA (WEST)" when ignore_regional=False + if ignore_regional: + # Safe to remove ALL parentheses since regional indicators are already being ignored + patterns_to_apply.extend(MISC_PATTERNS) + else: + # User wants to preserve regional indicators - skip MISC_PATTERNS to avoid + # removing parenthetical content that might be regional indicators + # Note: This means some misc tags like (CX), (B), (PRIME) won't be removed + # when ignore_regional=False, but this is the correct behavior to preserve + # regional tags like (WEST), (EAST), etc. + pass # Apply selected hardcoded patterns for pattern in patterns_to_apply: @@ -406,22 +419,42 @@ class FuzzyMatcher: # Apply user-configured ignored tags with improved handling for tag in user_ignored_tags: + escaped_tag = re.escape(tag) + # Check if tag contains brackets or parentheses - if so, match literally if '[' in tag or ']' in tag or '(' in tag or ')' in tag: - # Literal match for bracketed/parenthesized tags - escaped_tag = re.escape(tag) - name = re.sub(escaped_tag, '', name, flags=re.IGNORECASE) + # Literal match for bracketed/parenthesized tags, remove with trailing whitespace + name = re.sub(escaped_tag + r'\s*', '', name, flags=re.IGNORECASE) else: - # Word boundary match for simple word tags to avoid partial matches - # e.g., "East" won't match the "east" in "Feast" - escaped_tag = re.escape(tag) - name = re.sub(r'\b' + escaped_tag + r'\b', '', name, flags=re.IGNORECASE) + # CRITICAL FIX: Word boundaries (\b) only work with alphanumeric characters + # Tags with Unicode/special characters (like ┃NLZIET┃) fail with word boundaries + # Check if tag contains only word characters (alphanumeric + underscore) + if re.match(r'^\w+$', tag): + # Safe to use word boundaries for pure word tags + # This prevents "East" from matching the "east" in "Feast" + name = re.sub(r'\b' + escaped_tag + r'\b', '', name, flags=re.IGNORECASE) + else: + # Tag contains special/Unicode characters - can't use word boundaries + # Match the tag followed by optional whitespace + name = re.sub(escaped_tag + r'\s*', '', name, flags=re.IGNORECASE) # Remove callsigns in parentheses - name = re.sub(r'\([KW][A-Z]{3}(?:-(?:TV|CD|LP|DT|LD))?\)', '', name, flags=re.IGNORECASE) - - # Remove other tags in parentheses - name = re.sub(r'\([A-Z0-9]+\)', '', name) + # CRITICAL FIX: Don't remove regional indicators like (WEST), (EAST), etc. when ignore_regional=False + # The callsign pattern \([KW][A-Z]{3}...\) accidentally matches (WEST), (WETA), (KOMO), etc. + # We need to exclude known regional indicators even when matching callsigns + if ignore_regional: + # Safe to remove callsigns without checking for regional indicators + name = re.sub(r'\([KW][A-Z]{3}(?:-(?:TV|CD|LP|DT|LD))?\)', '', name, flags=re.IGNORECASE) + else: + # Only remove callsigns that are NOT regional indicators + # Use negative lookahead to exclude WEST, EAST, etc. + # Pattern matches (K or W) + 3 letters, but NOT if those 3 letters form a regional word + name = re.sub(r'\([KW](?!EST\)|ACIFIC\)|ENTRAL\)|OUNTAIN\)|TLANTIC\))[A-Z]{3}(?:-(?:TV|CD|LP|DT|LD))?\)', '', name, flags=re.IGNORECASE) + + # Remove other tags in parentheses (but only if we're also ignoring regional tags) + # Otherwise this would remove regional indicators like (WEST), (EAST), etc. + if ignore_regional: + name = re.sub(r'\([A-Z0-9]+\)', '', name) # Remove common pattern fixes name = re.sub(r'^The\s+', '', name, flags=re.IGNORECASE) @@ -569,7 +602,8 @@ class FuzzyMatcher: tokens = sorted([token for token in cleaned_s.split() if token]) return " ".join(tokens) - def find_best_match(self, query_name, candidate_names, user_ignored_tags=None, remove_cinemax=False): + def find_best_match(self, query_name, candidate_names, user_ignored_tags=None, remove_cinemax=False, + ignore_quality=True, ignore_regional=True, ignore_geographic=True, ignore_misc=True): """ Find the best fuzzy match for a name among a list of candidate names. @@ -578,6 +612,10 @@ class FuzzyMatcher: candidate_names: List of candidate names to match against user_ignored_tags: User-configured tags to ignore remove_cinemax: If True, remove "Cinemax" from candidate names + ignore_quality: If True, remove ALL quality indicators during normalization + ignore_regional: If True, remove regional indicator patterns during normalization + ignore_geographic: If True, remove ALL country code patterns during normalization + ignore_misc: If True, remove ALL content within parentheses during normalization Returns: Tuple of (matched_name, score) or (None, 0) if no match found @@ -589,7 +627,11 @@ class FuzzyMatcher: user_ignored_tags = [] # Normalize the query (channel name - don't remove Cinemax from it) - normalized_query = self.normalize_name(query_name, user_ignored_tags) + normalized_query = self.normalize_name(query_name, user_ignored_tags, + ignore_quality=ignore_quality, + ignore_regional=ignore_regional, + ignore_geographic=ignore_geographic, + ignore_misc=ignore_misc) if not normalized_query: return None, 0 @@ -602,7 +644,12 @@ class FuzzyMatcher: for candidate in candidate_names: # Normalize candidate (stream name) with Cinemax removal if requested - candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax) + candidate_normalized = self.normalize_name(candidate, user_ignored_tags, + ignore_quality=ignore_quality, + ignore_regional=ignore_regional, + ignore_geographic=ignore_geographic, + ignore_misc=ignore_misc, + remove_cinemax=remove_cinemax) # Skip candidates that normalize to empty or very short strings if not candidate_normalized or len(candidate_normalized) < 2: @@ -623,7 +670,8 @@ class FuzzyMatcher: return None, 0 - def fuzzy_match(self, query_name, candidate_names, user_ignored_tags=None, remove_cinemax=False): + def fuzzy_match(self, query_name, candidate_names, user_ignored_tags=None, remove_cinemax=False, + ignore_quality=True, ignore_regional=True, ignore_geographic=True, ignore_misc=True): """ Generic fuzzy matching function that can match any name against a list of candidates. This is the main entry point for fuzzy matching. @@ -633,6 +681,10 @@ class FuzzyMatcher: candidate_names: List of candidate names to match against (stream names) user_ignored_tags: User-configured tags to ignore remove_cinemax: If True, remove "Cinemax" from candidate names (for channels with "max") + ignore_quality: If True, remove ALL quality indicators during normalization + ignore_regional: If True, remove regional indicator patterns during normalization + ignore_geographic: If True, remove ALL country code patterns during normalization + ignore_misc: If True, remove ALL content within parentheses during normalization Returns: Tuple of (matched_name, score, match_type) or (None, 0, None) if no match found @@ -644,7 +696,11 @@ class FuzzyMatcher: user_ignored_tags = [] # Normalize query (channel name - don't remove Cinemax from it) - normalized_query = self.normalize_name(query_name, user_ignored_tags) + normalized_query = self.normalize_name(query_name, user_ignored_tags, + ignore_quality=ignore_quality, + ignore_regional=ignore_regional, + ignore_geographic=ignore_geographic, + ignore_misc=ignore_misc) if not normalized_query: return None, 0, None @@ -659,7 +715,12 @@ class FuzzyMatcher: for candidate in candidate_names: # Normalize candidate (stream name) with Cinemax removal if requested - candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax) + candidate_normalized = self.normalize_name(candidate, user_ignored_tags, + ignore_quality=ignore_quality, + ignore_regional=ignore_regional, + ignore_geographic=ignore_geographic, + ignore_misc=ignore_misc, + remove_cinemax=remove_cinemax) # Skip candidates that normalize to empty or very short strings (< 2 chars) # This prevents false positives where multiple streams all normalize to "" @@ -686,7 +747,12 @@ class FuzzyMatcher: # Stage 2: Substring matching for candidate in candidate_names: # Normalize candidate (stream name) with Cinemax removal if requested - candidate_normalized = self.normalize_name(candidate, user_ignored_tags, remove_cinemax=remove_cinemax) + candidate_normalized = self.normalize_name(candidate, user_ignored_tags, + ignore_quality=ignore_quality, + ignore_regional=ignore_regional, + ignore_geographic=ignore_geographic, + ignore_misc=ignore_misc, + remove_cinemax=remove_cinemax) # Skip candidates that normalize to empty or very short strings if not candidate_normalized or len(candidate_normalized) < 2: @@ -696,18 +762,29 @@ class FuzzyMatcher: # Check if one is a substring of the other if normalized_query_lower in candidate_lower or candidate_lower in normalized_query_lower: - # Calculate similarity score - ratio = self.calculate_similarity(normalized_query_lower, candidate_lower) - if ratio > best_ratio: - best_match = candidate - best_ratio = ratio - match_type = "substring" + # CRITICAL FIX: Add length ratio requirement to prevent false positives + # like "story" matching "history" (story is 5 chars, history is 7 chars) + # Require strings to be within 75% of same length for substring match + # This ensures substring matches are semantically meaningful + length_ratio = min(len(normalized_query_lower), len(candidate_lower)) / max(len(normalized_query_lower), len(candidate_lower)) + if length_ratio >= 0.75: + # Calculate similarity score + ratio = self.calculate_similarity(normalized_query_lower, candidate_lower) + if ratio > best_ratio: + best_match = candidate + best_ratio = ratio + match_type = "substring" if best_match and int(best_ratio * 100) >= self.match_threshold: return best_match, int(best_ratio * 100), match_type # Stage 3: Fuzzy matching with token sorting - fuzzy_match, score = self.find_best_match(query_name, candidate_names, user_ignored_tags, remove_cinemax=remove_cinemax) + fuzzy_match, score = self.find_best_match(query_name, candidate_names, user_ignored_tags, + remove_cinemax=remove_cinemax, + ignore_quality=ignore_quality, + ignore_regional=ignore_regional, + ignore_geographic=ignore_geographic, + ignore_misc=ignore_misc) if fuzzy_match: return fuzzy_match, score, f"fuzzy ({score})" diff --git a/Stream-Mapparr/plugin.py b/Stream-Mapparr/plugin.py index d01e706..2fe5252 100644 --- a/Stream-Mapparr/plugin.py +++ b/Stream-Mapparr/plugin.py @@ -21,6 +21,8 @@ import threading # Import FuzzyMatcher from the same directory from .fuzzy_matcher import FuzzyMatcher +# Import fuzzy_matcher version for CSV header +from . import fuzzy_matcher # Django model imports - same approach as Event Channel Managarr from apps.channels.models import Channel, ChannelProfileMembership, ChannelStream, Stream @@ -63,8 +65,8 @@ class PluginConfig: """ # === PLUGIN METADATA === - PLUGIN_VERSION = "0.7.2" - FUZZY_MATCHER_MIN_VERSION = "25.354.1835" # Requires complete regional patterns support + PLUGIN_VERSION = "0.7.3" + FUZZY_MATCHER_MIN_VERSION = "25.358.0200" # Requires custom ignore tags Unicode fix # === MATCHING SETTINGS === DEFAULT_FUZZY_MATCH_THRESHOLD = 85 # Minimum similarity score (0-100) @@ -2021,7 +2023,9 @@ class Plugin: if self.fuzzy_matcher: stream_names = [stream['name'] for stream in working_streams] matched_stream_name, score, match_type = self.fuzzy_matcher.fuzzy_match( - channel_name, stream_names, ignore_tags, remove_cinemax=channel_has_max + channel_name, stream_names, ignore_tags, remove_cinemax=channel_has_max, + ignore_quality=ignore_quality, ignore_regional=ignore_regional, + ignore_geographic=ignore_geographic, ignore_misc=ignore_misc ) if matched_stream_name: @@ -2055,10 +2059,16 @@ class Plugin: # Substring match: stream contains channel OR channel contains stream if stream_lower in channel_lower or channel_lower in stream_lower: - # Calculate similarity to ensure it meets threshold - similarity = self.fuzzy_matcher.calculate_similarity(stream_lower, channel_lower) - if int(similarity * 100) >= self.fuzzy_matcher.match_threshold: - matching_streams.append(stream) + # CRITICAL FIX: Add length ratio requirement to prevent false positives + # like "story" matching "history" (story is 5 chars, history is 7 chars) + # Require strings to be within 75% of same length for substring match + # This ensures substring matches are semantically meaningful + length_ratio = min(len(stream_lower), len(channel_lower)) / max(len(stream_lower), len(channel_lower)) + if length_ratio >= 0.75: + # Calculate similarity to ensure it meets threshold + similarity = self.fuzzy_matcher.calculate_similarity(stream_lower, channel_lower) + if int(similarity * 100) >= self.fuzzy_matcher.match_threshold: + matching_streams.append(stream) continue # Token-based matching: check if significant tokens overlap @@ -2245,7 +2255,9 @@ class Plugin: try: stream_names = [stream['name'] for stream in all_streams] matched_stream_name, score, match_type = self.fuzzy_matcher.fuzzy_match( - channel_name, stream_names, ignore_tags, remove_cinemax=channel_has_max + channel_name, stream_names, ignore_tags, remove_cinemax=channel_has_max, + ignore_quality=ignore_quality, ignore_regional=ignore_regional, + ignore_geographic=ignore_geographic, ignore_misc=ignore_misc ) if matched_stream_name: @@ -3279,6 +3291,7 @@ class Plugin: # Build header with all settings except login credentials header_lines = [ f"# Stream-Mapparr Export v{self.version}", + f"# FuzzyMatcher Version: {fuzzy_matcher.__version__}", f"# Export Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", "#", "# === Action Performed ===",