import pandas as pd
#import argparse
#import glob
import os
import re
from tools.helper_functions import OUTPUT_FOLDER
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import PorterStemmer
#import spacy
import numpy as np
import random
import string
from typing import List
from gradio import Progress
import en_core_web_lg #en_core_web_sm
nlp = en_core_web_lg.load()

#from tqdm import tqdm
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')

similarity_threshold = 0.9
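
# The OCR CSVs consumed below are expected to look roughly like this (illustrative shape only,
# inferred from the 'page'/'text' handling in combine_ocr_output_text):
#
#   page,text
#   1,"Introduction to the service"
#   1,"Contact details"
#   2,"Terms and conditions"
#
# Rows that share a page number are concatenated into a single text entry per page.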

def combine_ocr_output_text(input_files: List[str], output_folder: str = OUTPUT_FOLDER):
    """
    Combines text from multiple CSV files containing page and text columns.
    Groups text by file and page number, concatenating text within these groups.

    Args:
        input_files (list): List of paths to CSV files
    Returns:
        tuple: Combined dataframe with columns [file, page, text], plus a list containing
        the path of the combined CSV written to output_folder
    """
    all_data = []
    output_files = []

    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files

    for file in file_paths_list:
        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        # Read CSV file
        df = pd.read_csv(file_path)

        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        df['text'] = df['text'].fillna('').astype(str)

        # Group by page and concatenate text
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()

        # Add filename column
        grouped['file'] = os.path.basename(file_path)

        all_data.append(grouped)

    if not all_data:
        raise ValueError("No valid CSV files were processed")

    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)

    # Reorder columns
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=None)
    output_files.append(output_combined_file_path)

    return combined_df, output_files
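
# Example usage (a minimal sketch; the file names below are hypothetical OCR outputs,
# each expected to contain 'page' and 'text' columns):
#
# combined_df, written_files = combine_ocr_output_text(
#     ["output/ocr_output_doc1.csv", "output/ocr_output_doc2.csv"]
# )
# print(combined_df.columns.tolist())  # ['file', 'page', 'text']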

def process_data(df: pd.DataFrame, column: str):
    '''
    Clean and lemmatise a text column in a data frame, writing the result to 'text_clean'.
    '''

    def _clean_text(raw_text):
        # Remove HTML tags
        clean = re.sub(r'<.*?>', '', raw_text)
        # clean = re.sub(r' ', ' ', clean)
        # clean = re.sub(r'\r\n', ' ', clean)
        # clean = re.sub(r'<', ' ', clean)
        # clean = re.sub(r'>', ' ', clean)
        # clean = re.sub(r'<strong>', ' ', clean)
        # clean = re.sub(r'</strong>', ' ', clean)

        # Replace non-breaking space \xa0 with a space
        # clean = clean.replace(u'\xa0', u' ')

        # Remove extra whitespace
        clean = ' '.join(clean.split())

        # # Tokenize the text
        # words = word_tokenize(clean.lower())
        # # Remove punctuation and numbers
        # words = [word for word in words if word.isalpha()]
        # # Remove stopwords
        # words = [word for word in words if word not in stop_words]
        # Join the cleaned words back into a string
        return clean

    # Function to apply lemmatization and remove stopwords
    def _apply_lemmatization(text):
        doc = nlp(text)
        # Keep only alphabetic tokens and remove stopwords
        lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        return ' '.join(lemmatized_words)

    df['text_clean'] = df[column].apply(_clean_text)
    df['text_clean'] = df['text_clean'].apply(_apply_lemmatization)

    return df
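
# Example usage (a minimal sketch; assumes a dataframe with a 'text' column):
#
# df = pd.DataFrame({'file': ['a.csv'], 'page': [1], 'text': ["<p>The cats were running!</p>"]})
# df = process_data(df, 'text')
# print(df['text_clean'].iloc[0])  # e.g. "cat run" after HTML stripping, stopword removal and lemmatisation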

def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder: str = OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):

    output_paths = []

    progress(0.1, desc="Cleaning input texts")

    # Load and clean data
    df, output_files = combine_ocr_output_text(input_files)
    output_paths.extend(output_files)

    df = process_data(df, 'text')  # Adds a 'text_clean' column alongside 'file' and 'page'

    # Vectorize text
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    progress(0.3, desc="Calculating text similarity")

    # Compute sparse cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)  # Keep sparse format

    # Extract indices of similar pages above threshold
    coo_matrix = similarity_matrix.tocoo()
    similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])

    if similar_pages.size == 0:
        return pd.DataFrame(), output_paths  # Return empty if no matches

    # Create a DataFrame for similar pairs
    similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])

    # Remove duplicate pairs (keep one direction)
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    progress(0.8, desc="Mapping back results")

    # Map indices to metadata
    # index_map = df[['file', 'page', 'text']].to_dict(orient='index')
    # similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['file'])
    # similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['file'])
    # similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['page'])
    # similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['page'])
    # similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['text'][0:200])
    # similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['text'][0:200])

    # Create a DataFrame with the metadata
    metadata_df = df[['file', 'page', 'text']].reset_index()

    # Merge to get the metadata for Page1
    similarity_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_on='index', suffixes=('', '_Page1'))
    similarity_df = similarity_df.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})

    # Merge to get the metadata for Page2
    similarity_df = similarity_df.merge(metadata_df, left_on='Page2_Index', right_on='index', suffixes=('', '_Page2'))
    similarity_df = similarity_df.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})

    # Optionally, drop the index columns if not needed
    #similarity_df = similarity_df.drop(columns=['index_Page1', 'index_Page2'])

    similarity_df["Similarity_Score"] = similarity_df["Similarity_Score"].round(3)

    # Sort results
    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Keep only the first 100 characters of each page's text as a preview
    similarity_df_out['Page1_Text'] = similarity_df_out['Page1_Text'].str[0:100]
    similarity_df_out['Page2_Text'] = similarity_df_out['Page2_Text'].str[0:100]

    progress(0.8, desc="Saving output files")

    # Save results
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)
    output_paths.append(similarity_file_output_path)

    # Save per-file redaction lists
    for redact_file in similarity_df_out['Page2_File'].unique():
        output_file_name = output_folder + redact_file + "_whole_page.csv"
        whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
        whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
        output_paths.append(output_file_name)

    return similarity_df_out, output_paths
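
# Example usage (a minimal sketch; the file names are hypothetical OCR output CSVs with
# 'page' and 'text' columns):
#
# similarity_df, paths = identify_similar_pages(
#     ["output/ocr_output_doc1.csv", "output/ocr_output_doc2.csv"],
#     similarity_threshold=0.95,
# )
# 'paths' will include the combined OCR CSV, the page similarity results CSV,
# and one "<file>_whole_page.csv" list of pages to redact per matched file.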

# Perturb text
# Apply the perturbation function with a 10% error probability
def perturb_text_with_errors(series: pd.Series):

    def _perturb_text(text, error_probability=0.1):
        words = text.split()  # Split text into words
        perturbed_words = []

        for word in words:
            if random.random() < error_probability:  # Add a random error
                perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])

                if perturbation_type == 'char_error':  # Introduce a character error
                    idx = random.randint(0, len(word) - 1)
                    char = random.choice(string.ascii_lowercase)  # Add a random letter
                    word = word[:idx] + char + word[idx:]

                elif perturbation_type == 'extra_space':  # Add extra space around a word
                    word = ' ' + word + ' '

                elif perturbation_type == 'extra_punctuation':  # Add punctuation to the word
                    punctuation = random.choice(string.punctuation)
                    idx = random.randint(0, len(word))  # Insert punctuation randomly
                    word = word[:idx] + punctuation + word[idx:]

            perturbed_words.append(word)

        return ' '.join(perturbed_words)

    series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))

    return series
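
# Example usage (a minimal sketch for testing duplicate detection on noisy copies;
# the column name 'text' is assumed):
#
# df = pd.DataFrame({'text': ["The quick brown fox", "jumps over the lazy dog"]})
# df['text_noisy'] = perturb_text_with_errors(df['text'])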