# NOTE: a "Spaces: Running" page banner was captured here during extraction;
# it is a scraping artifact, not part of the source.
import difflib
import html
import re
from typing import List, Tuple
# --- Helper Function for Markdown Highlighting ---
def generate_highlighted_markdown(text, spans_with_info):
    """Render ``text`` with highlighted, hoverable spans for Markdown output.

    Args:
        text: The raw document text. It is inserted WITHOUT HTML escaping so
            that any Markdown it contains still renders.
        spans_with_info: Iterable of ``(start, end, hover_text)`` tuples, where
            ``start``/``end`` are character offsets into ``text`` and
            ``hover_text`` is shown via the span's ``title`` attribute.

    Returns:
        A string in which every valid span is wrapped in a light-green
        ``<span>``. Malformed spans are dropped, and overlapping spans are
        clipped so no character of ``text`` is ever emitted twice.
    """
    # Keep only well-formed (start, end, info) spans and walk them left to right.
    valid_spans = sorted(
        (
            (s, e, info)
            for s, e, info in spans_with_info
            if isinstance(s, int) and isinstance(e, int) and 0 <= s <= e <= len(text)
        ),
        key=lambda span: span[0],
    )
    highlighted_parts = []
    current_pos = 0
    for start, end, hover_text in valid_spans:
        # BUGFIX: clip spans that overlap text we have already emitted.
        # Previously an overlapping span re-emitted text[start:end] from its
        # original start, duplicating the overlapped characters in the output.
        start = max(start, current_pos)
        # Plain text between the previous span and this one (NO HTML escaping).
        if start > current_pos:
            highlighted_parts.append(text[current_pos:start])
        if start < end:
            # quote=True escapes both quote characters, keeping the
            # single-quoted title attribute well-formed.
            escaped_hover_text = html.escape(hover_text, quote=True)
            # Escape the highlighted content itself for safe display.
            escaped_content = html.escape(text[start:end])
            highlighted_parts.append(
                f"<span style='background-color: lightgreen;' title='{escaped_hover_text}'>{escaped_content}</span>"
            )
        # Never move backward, even if spans overlap.
        current_pos = max(current_pos, end)
    # Any remaining text after the last span (NO HTML escaping).
    if current_pos < len(text):
        highlighted_parts.append(text[current_pos:])
    return "".join(highlighted_parts)
# --- Citation Span Matching Function ---
def find_citation_spans(document: str, citation: str) -> List[Tuple[int, int]]:
    """
    Finds character spans in the document that likely form the citation,
    allowing for fragments and minor differences. Uses SequenceMatcher
    on alphanumeric words and maps back to character indices.
    This follows a greedy iterative strategy to find the longest match to account for cases where fragments are reordered.
    Args:
        document: The source document string.
        citation: The citation string, potentially with fragments/typos.
    Returns:
        A list of (start, end) character tuples from the document,
        representing the most likely origins of the citation fragments.
    """
    # 1. Tokenize document and citation into ALPHANUMERIC words with char spans
    # Each token is (word, start_char, end_char); punctuation/whitespace is skipped,
    # which is what makes the matching tolerant of formatting differences.
    doc_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", document)
    ]
    cite_tokens = [
        (m.group(0), m.start(), m.end()) for m in re.finditer(r"[a-zA-Z0-9]+", citation)
    ]
    if not doc_tokens or not cite_tokens:
        return []
    # Lowercase for case-insensitive word comparison; char spans stay in *_tokens.
    doc_words = [t[0].lower() for t in doc_tokens]
    cite_words = [t[0].lower() for t in cite_tokens]
    # 2. Find longest common blocks of words using SequenceMatcher
    # autojunk=False so frequent words (e.g. "the") are never treated as junk.
    matcher = difflib.SequenceMatcher(None, doc_words, cite_words, autojunk=False)
    matching_blocks = []
    # Count of citation words matched so far; loop stops when all are covered.
    matched_tokens = 0
    # Word-index intervals [start, end) not yet consumed by a match.
    unmatched_doc_words = [(0, len(doc_words))]
    unmatched_cite_words = [(0, len(cite_words))]
    # Greedy iteration: repeatedly take the single longest match available
    # across every remaining (doc region x citation region) pair. This lets
    # reordered citation fragments each find their own document location.
    while matched_tokens < len(cite_words):
        next_match_candidates = []
        for da, db in unmatched_doc_words:
            for ca, cb in unmatched_cite_words:
                # find_longest_match(alo, ahi, blo, bhi) restricts the search
                # to the current unmatched sub-regions.
                match = matcher.find_longest_match(da, db, ca, cb)
                if match.size > 0:
                    next_match_candidates.append(match)
        if len(next_match_candidates) == 0:
            break
        # Ties in size resolve to the first candidate encountered (max is stable).
        next_match = max(next_match_candidates, key=lambda x: x.size)
        matching_blocks.append(next_match)
        matched_tokens += next_match.size
        # Update unmatched regions (this part needs careful implementation)
        # Simplified logic: remove fully contained regions and split overlapping ones
        new_unmatched_docs = []
        for da, db in unmatched_doc_words:
            # Check if this doc segment overlaps with the match
            if next_match.a < db and next_match.a + next_match.size > da:
                # Add segment before the match
                if next_match.a > da:
                    new_unmatched_docs.append((da, next_match.a))
                # Add segment after the match
                if next_match.a + next_match.size < db:
                    new_unmatched_docs.append((next_match.a + next_match.size, db))
            else:
                new_unmatched_docs.append((da, db))  # Keep non-overlapping segment
        unmatched_doc_words = new_unmatched_docs
        # Same splitting for the citation-side regions (match.b indexes citation words).
        new_unmatched_cites = []
        for ca, cb in unmatched_cite_words:
            if next_match.b < cb and next_match.b + next_match.size > ca:
                if next_match.b > ca:
                    new_unmatched_cites.append((ca, next_match.b))
                if next_match.b + next_match.size < cb:
                    new_unmatched_cites.append((next_match.b + next_match.size, cb))
            else:
                new_unmatched_cites.append((ca, cb))
        unmatched_cite_words = new_unmatched_cites
    # 3. Convert matching word blocks back to character spans
    # Blocks are sorted by document position; (i, j, n) = (doc word idx,
    # citation word idx, length in words).
    char_spans = []
    for i, j, n in sorted(matching_blocks, key=lambda x: x.a):
        if n == 0:
            continue
        start_char = doc_tokens[i][1]
        end_char = doc_tokens[i + n - 1][2]
        # Merge spans that touch or are separated by at most one character
        # (typically the single space between adjacent words).
        if char_spans and char_spans[-1][1] >= start_char - 1:
            char_spans[-1] = (char_spans[-1][0], max(char_spans[-1][1], end_char))
        else:
            char_spans.append((start_char, end_char))
    return char_spans