import pandas as pd
#import argparse
#import glob
import os
import re
from tools.helper_functions import OUTPUT_FOLDER
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import PorterStemmer
#import spacy
import numpy as np
import random
import string
from typing import List
from gradio import Progress

import en_core_web_lg #en_core_web_sm
nlp = en_core_web_lg.load()
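# Note: the large English spaCy model is assumed to be installed separately,
# e.g. via `python -m spacy download en_core_web_lg`.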
#from tqdm import tqdm

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')

similarity_threshold = 0.9  # Cosine similarity cutoff above which two pages are treated as near-duplicates


def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
    """
    Combines text from multiple CSV files containing page and text columns.
    Groups text by file and page number, concatenating text within these groups.
    
    Args:
        input_files (list): List of paths to CSV files
    
    Returns:
        pd.DataFrame: Combined dataframe with columns [file, page, text]
    """
    all_data = []
    output_files = []

    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files
    
    for file in file_paths_list:

        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        # Read CSV file
        df = pd.read_csv(file_path)
        
        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        df['text'] = df['text'].fillna('').astype(str)
        
        # Group by page and concatenate text
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
        
        # Add filename column
        grouped['file'] = os.path.basename(file_path)
        
        all_data.append(grouped)
    
    if not all_data:
        raise ValueError("No valid CSV files were processed")
    
    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)
    
    # Reorder columns
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=False)

    output_files.append(output_combined_file_path)
    
    return combined_df, output_files
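# Illustrative example of the expected input/output shapes (file names are hypothetical):
# an input CSV with rows like
#   page,text
#   1,First line of page one
#   1,Second line of page one
#   2,Text on page two
# is grouped into one row per page in the combined dataframe, e.g.
#   file,page,text
#   example_ocr_output.csv,1,First line of page one Second line of page one
#   example_ocr_output.csv,2,Text on page two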

def process_data(df:pd.DataFrame, column:str):
    '''
    Clean and lemmatise a text column in a data frame, writing the result to a 'text_clean' column.
    '''
    
    def _clean_text(raw_text):
        # Remove HTML tags
        clean = re.sub(r'<.*?>', '', raw_text)
        # clean = re.sub(r'&nbsp;', ' ', clean)
        # clean = re.sub(r'\r\n', ' ', clean)
        # clean = re.sub(r'&lt;', ' ', clean)
        # clean = re.sub(r'&gt;', ' ', clean)
        # clean = re.sub(r'<strong>', ' ', clean)
        # clean = re.sub(r'</strong>', ' ', clean)

        # Replace non-breaking space \xa0 with a space
        # clean = clean.replace(u'\xa0', u' ')
        # Remove extra whitespace
        clean = ' '.join(clean.split())

        # # Tokenize the text
        # words = word_tokenize(clean.lower())

        # # Remove punctuation and numbers
        # words = [word for word in words if word.isalpha()]

        # # Remove stopwords
        # words = [word for word in words if word not in stop_words]

        # Return the cleaned text
        return clean

    # Function to apply lemmatization and remove stopwords
    def _apply_lemmatization(text):
        doc = nlp(text)
        # Keep only alphabetic tokens and remove stopwords
        lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        return ' '.join(lemmatized_words)
    
    df['text_clean'] = df[column].apply(_clean_text)

    df['text_clean'] = df['text_clean'].apply(_apply_lemmatization)
    
    return df
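# Illustrative example (exact output depends on the loaded spaCy model):
# process_data(pd.DataFrame({'text': ['The <b>cats</b> are running']}), 'text')
# strips the HTML tags, removes stopwords and lemmatises the remainder,
# giving a 'text_clean' value along the lines of "cat run".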

def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
    output_paths = []
    
    progress(0.1, desc="Cleaning input texts")

    # Load and clean data
    df, output_files = combine_ocr_output_text(input_files, output_folder=output_folder)
    output_paths.extend(output_files)
    df = process_data(df, 'text')  # Assume this returns 'text_clean', 'file', and 'page' columns

    # Vectorize text
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    progress(0.3, desc="Calculating text similarity")

    # Compute sparse cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)  # Keep sparse format

    # Extract indices of similar pages above threshold
    coo_matrix = similarity_matrix.tocoo()
    similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])

    if similar_pages.size == 0:
        return pd.DataFrame(), output_paths  # Return empty if no matches

    # Create a DataFrame for similar pairs
    similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])

    # The indices come back as floats from the numpy array above, so cast them before merging on the integer index below
    similarity_df[['Page1_Index', 'Page2_Index']] = similarity_df[['Page1_Index', 'Page2_Index']].astype(int)

    # Remove duplicate pairs (keep one direction)
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    progress(0.8, desc="Mapping back results")
    # Map indices to metadata
    # index_map = df[['file', 'page', 'text']].to_dict(orient='index')
    # similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['file'])
    # similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['file'])
    # similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['page'])
    # similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['page'])
    # similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['text'][0:200])
    # similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['text'][0:200])

    # Create a DataFrame with the metadata
    metadata_df = df[['file', 'page', 'text']].reset_index()

    # Merge to get the metadata for Page1
    similarity_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_on='index', suffixes=('', '_Page1'))
    similarity_df = similarity_df.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})

    # Merge to get the metadata for Page2
    similarity_df = similarity_df.merge(metadata_df, left_on='Page2_Index', right_on='index', suffixes=('', '_Page2'))
    similarity_df = similarity_df.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})

    # Optionally, drop the index columns if not needed
    #similarity_df = similarity_df.drop(columns=['index_Page1', 'index_Page2'])


    similarity_df["Similarity_Score"] = similarity_df["Similarity_Score"].round(3)

    # Sort results
    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Keep only the first 100 characters of each page's text as a preview
    similarity_df_out['Page1_Text'] = similarity_df_out['Page1_Text'].str[0:100]
    similarity_df_out['Page2_Text'] = similarity_df_out['Page2_Text'].str[0:100]

    progress(0.8, desc="Saving output files")

    # Save results
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)
    output_paths.append(similarity_file_output_path)

    # Save per-file redaction lists
    for redact_file in similarity_df_out['Page2_File'].unique():
        output_file_name = output_folder + redact_file + "_whole_page.csv"
        whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
        whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
        output_paths.append(output_file_name)

    return similarity_df_out, output_paths
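# For reference, page_similarity_results.csv contains the columns
# Page1_File, Page1_Page, Page2_File, Page2_Page, Similarity_Score, Page1_Text, Page2_Text,
# and each per-file "<file>_whole_page.csv" is a single headerless column of page numbers to redact.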

# Perturb text
def perturb_text_with_errors(series:pd.Series):
    '''
    Add random OCR-style errors (inserted characters, extra spaces, extra punctuation)
    to each string in a series, with a 10% error probability per word.
    '''

    def _perturb_text(text, error_probability=0.1):
        words = text.split()  # Split text into words
        perturbed_words = []
        
        for word in words:
            if random.random() < error_probability:  # Add a random error
                perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])
                
                if perturbation_type == 'char_error':  # Introduce a character error
                    idx = random.randint(0, len(word) - 1)
                    char = random.choice(string.ascii_lowercase)  # Add a random letter
                    word = word[:idx] + char + word[idx:]
                
                elif perturbation_type == 'extra_space':  # Add extra space around a word
                    word = ' ' + word + ' '
                
                elif perturbation_type == 'extra_punctuation':  # Add punctuation to the word
                    punctuation = random.choice(string.punctuation)
                    idx = random.randint(0, len(word))  # Insert punctuation randomly
                    word = word[:idx] + punctuation + word[idx:]
            
            perturbed_words.append(word)
        
        return ' '.join(perturbed_words)

    series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))

    return series
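
if __name__ == "__main__":
    # Minimal usage sketch with hypothetical file paths. The OCR output CSVs are assumed to
    # contain 'page' and 'text' columns, and gr.Progress is expected to degrade gracefully
    # when these functions are run outside the Gradio app.
    example_files = ["output/doc1_ocr_output.csv", "output/doc2_ocr_output.csv"]

    similarity_df_out, output_paths = identify_similar_pages(example_files, similarity_threshold=0.9)
    print(similarity_df_out.head())
    print("Output files written:", output_paths)

    # Optionally add synthetic OCR-style noise to the combined text, e.g. to test how robust
    # the similarity matching is to small recognition errors.
    combined_df, _ = combine_ocr_output_text(example_files)
    combined_df['text'] = perturb_text_with_errors(combined_df['text'])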