ejschwartz committed on
Commit
52b18ab
·
1 Parent(s): c4e7153

Add wildcard distance

Browse files
Files changed (2) hide show
  1. dist.py +199 -0
  2. main.py +2 -0
dist.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def levenshtein_with_wildcards(str1, str2, wildcard='?', verbose=False):
    """
    Calculate the Levenshtein distance between two strings with support for wildcards.

    A wildcard occurring in either input aligns with any single character of
    the other input at zero cost. Every other insertion, deletion or
    substitution costs 1.

    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        wildcard (str, optional): The wildcard character. Defaults to '?'.
        verbose (bool, optional): If True, the alignment is traced with
            explain_match() and its explanations are returned alongside the
            distance. Nothing is printed by this function.

    Returns:
        int: The Levenshtein distance between the two strings
            (when verbose is False).
        tuple: ``(distance, operations)`` when verbose is True, where
            ``operations`` is the list of explanation strings produced by
            explain_match().
    """
    m, n = len(str1), len(str2)

    # dp[i][j] holds the distance between str1[:i] and str2[:j].
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Turning a prefix into the empty string (or back) costs one edit per
    # character, which fixes the first column and the first row.
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j

    for i in range(1, m + 1):
        char1 = str1[i - 1]
        for j in range(1, n + 1):
            char2 = str2[j - 1]
            if char1 == wildcard or char2 == wildcard:
                # A wildcard pairs with anything for free. Taking the diagonal
                # directly is safe: neighbouring cells of an edit-distance
                # table differ by at most 1, so a zero-cost diagonal step is
                # never beaten by an insertion or a deletion.
                dp[i][j] = dp[i - 1][j - 1]
            else:
                cost = 0 if char1 == char2 else 1
                dp[i][j] = min(
                    dp[i - 1][j] + 1,        # deletion
                    dp[i][j - 1] + 1,        # insertion
                    dp[i - 1][j - 1] + cost  # match / substitution
                )

    if verbose:
        operations = explain_match(str1, str2, dp, wildcard)
        return dp[m][n], operations

    return dp[m][n]
46
+
47
def explain_match(str1, str2, dp, wildcard='?'):
    """
    Traces the optimal alignment path and explains each step of the matching process.

    The filled DP matrix is walked backwards from its bottom-right cell to
    its top-left cell. At every interior cell the walk moves to the
    neighbouring cell holding the smallest value; ties prefer the diagonal,
    then the cell above, then the cell to the left. The visited cells are
    then replayed front to back and each move is described in words.

    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        dp (list): The dynamic programming matrix, indexed as dp[i][j] with
            (len(str1) + 1) rows and (len(str2) + 1) columns.
        wildcard (str, optional): The wildcard character. Defaults to '?'.

    Returns:
        list: A list of explanation strings for each operation performed,
            ordered from the start of the strings to their end.
    """
    i, j = len(str1), len(str2)

    # Cells visited by the backward walk, starting at the bottom-right corner.
    path = [(i, j)]
    while i > 0 or j > 0:
        if i == 0:
            # Top row: only insertions remain.
            j -= 1
        elif j == 0:
            # Left column: only deletions remain.
            i -= 1
        else:
            # These are the values stored in the neighbouring cells, not the
            # cost of the step itself.
            diagonal = dp[i - 1][j - 1]
            above = dp[i - 1][j]
            left = dp[i][j - 1]
            smallest = min(diagonal, above, left)

            if smallest == diagonal:
                i -= 1
                j -= 1
            elif smallest == above:
                i -= 1
            else:
                j -= 1
        path.append((i, j))

    path.reverse()

    # Describe each move between consecutive cells of the forward path.
    operations = []
    for (prev_i, prev_j), (curr_i, curr_j) in zip(path, path[1:]):
        moved_down = curr_i > prev_i
        moved_right = curr_j > prev_j

        if moved_down and moved_right:
            # Diagonal move: wildcard match, exact match or substitution.
            char1 = str1[curr_i - 1]
            char2 = str2[curr_j - 1]

            if char1 == wildcard or char2 == wildcard:
                if char1 == wildcard:
                    wildcard_char, match_char = char1, char2
                else:
                    wildcard_char, match_char = char2, char1
                operations.append(
                    f"Wildcard match: '{wildcard_char}' matches any character, here '{match_char}'"
                )
            elif char1 == char2:
                operations.append(f"Match: '{char1}' matches '{char2}'")
            else:
                operations.append(f"Substitution: Replace '{char1}' with '{char2}'")
        elif moved_right:
            # Horizontal move (insertion)
            operations.append(f"Insertion: Insert '{str2[curr_j - 1]}'")
        elif moved_down:
            # Vertical move (deletion)
            operations.append(f"Deletion: Delete '{str1[curr_i - 1]}'")

    return operations
120
+
121
def print_match_summary(str1, str2, wildcard='?'):
    """
    Prints a summary of the match between two strings, highlighting wildcards.

    The output lists the edit distance, every step of the alignment, a
    three-line visual alignment (first string, indicator row, second string)
    and finally the wildcard matches on their own.

    Args:
        str1 (str): The first string.
        str2 (str): The second string.
        wildcard (str, optional): The wildcard character. Defaults to '?'.

    Returns:
        tuple: ``(distance, operations)`` exactly as returned by
            levenshtein_with_wildcards(..., verbose=True).
    """
    distance, operations = levenshtein_with_wildcards(str1, str2, wildcard, verbose=True)

    print(f"Comparing '{str1}' and '{str2}' (wildcard: '{wildcard}')")
    print(f"Edit distance: {distance}")
    print("\nMatch process:")

    for step, op in enumerate(operations, start=1):
        print(f"Step {step}: {op}")

    # Visual representation.
    # Operations are recognised by the label at the START of each message.
    # The messages also embed characters taken from the inputs, so searching
    # the whole message for a keyword could misclassify a step.
    diagonal_marks = (
        ("Wildcard match:", "*"),  # Wildcard match
        ("Match:", "|"),           # Exact match
        ("Substitution:", "X"),    # Substitution
    )

    aligned_str1 = []
    aligned_str2 = []
    match_indicators = []
    i, j = 0, 0  # next unconsumed position in str1 / str2

    for op in operations:
        mark = next((m for label, m in diagonal_marks if op.startswith(label)), None)
        if mark is not None:
            aligned_str1.append(str1[i])
            aligned_str2.append(str2[j])
            match_indicators.append(mark)
            i += 1
            j += 1
        elif op.startswith("Insertion:"):
            aligned_str1.append("-")
            aligned_str2.append(str2[j])
            match_indicators.append(" ")
            j += 1
        elif op.startswith("Deletion:"):
            aligned_str1.append(str1[i])
            aligned_str2.append("-")
            match_indicators.append(" ")
            i += 1

    print("\nAlignment:")
    print("".join(aligned_str1))
    print("".join(match_indicators))
    print("".join(aligned_str2))
    print("\nLegend:")
    print("| = exact match, * = wildcard match, X = substitution, - = gap (insertion/deletion)")

    # Summary of wildcard matches
    wildcard_matches = [op for op in operations if op.startswith("Wildcard match:")]
    if wildcard_matches:
        print("\nWildcard matches:")
        for match in wildcard_matches:
            print(f"- {match}")

    return distance, operations
186
+
187
# Example usage
if __name__ == "__main__":
    # Basic examples; the trailing comment gives the expected edit distance.
    print_match_summary("hello", "hello")  # 0 (identical strings)
    print_match_summary("hello", "hallo")  # 1 (one substitution)
    print_match_summary("he?lo", "hello")  # 0 (wildcard matches 'l')
    print_match_summary("he?lo", "hallo")  # 1 ('e' -> 'a' substitution; wildcard matches 'l')
    print_match_summary("h?llo", "hello")  # 0 (wildcard matches 'e')
    print_match_summary("h?llo", "hillo")  # 0 (wildcard matches 'i')
    print_match_summary("c?t", "cat")      # 0 (wildcard matches 'a')
    print_match_summary("c?t", "cut")      # 0 (wildcard matches 'u')
    print_match_summary("w?rd", "word")    # 0 (wildcard matches 'o')
    print_match_summary("d?g", "dog")      # 0 (wildcard matches 'o')
main.py CHANGED
@@ -7,6 +7,8 @@ import shlex
7
  import subprocess
8
  import tempfile
9
 
 
 
10
  description = frontmatter.load("README.md").content
11
 
12
  def trim(str, n):
 
7
  import subprocess
8
  import tempfile
9
 
10
+ from dist import levenshtein_with_wildcards, print_match_summary
11
+
12
  description = frontmatter.load("README.md").content
13
 
14
  def trim(str, n):