Commit a33b955 · 1 Parent(s): 0b9e789
Fixed an issue where Docker containers built locally did not have the correct folder permissions. Improved the config file. Updated the Gradio version to fix an issue with selecting filtered rows. Minor bug fixes.
Files changed:
- Dockerfile (+10 -7)
- app.py (+4 -9)
- requirements.txt (+2 -2)
- tools/aws_textract.py (+4 -5)
- tools/config.py (+59 -12)
- tools/file_redaction.py (+21 -12)
- tools/helper_functions.py (+2 -2)
Dockerfile (CHANGED)

@@ -69,10 +69,11 @@ RUN chmod +x /entrypoint.sh
 # Switch to the "user" user
 USER user
 
+ENV APP_HOME=/home/user
+
 # Set environmental variables
-ENV HOME=/home/user \
-
-    PYTHONPATH=/home/user/app \
+ENV PATH=$APP_HOME/.local/bin:$PATH \
+    PYTHONPATH=$APP_HOME/app \
     PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     GRADIO_ALLOW_FLAGGING=never \
@@ -80,15 +81,17 @@ ENV HOME=/home/user \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
     GRADIO_ANALYTICS_ENABLED=False \
-
-    TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
+    TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
     SYSTEM=spaces
 
 # Set the working directory to the user's home directory
-WORKDIR $
+WORKDIR $APP_HOME/app
 
 # Copy the app code to the container
-COPY --chown=user . $
+COPY --chown=user . $APP_HOME/app
+
+# Ensure permissions are really user:user again after copying
+RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
 
 ENTRYPOINT [ "/entrypoint.sh" ]
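The new APP_HOME variable and the final chown/chmod step address the local-build case where the copied app tree was not writable by the "user" account. A small, hypothetical smoke test that could be run as that user inside the container; the output/ and input/ subfolders are the defaults from tools/config.py and /home/user/app comes from the new WORKDIR, so adjust if those assumptions do not hold:

import os
import tempfile

# Hypothetical check: confirm the app user can create and write files
# under the folders the app actually uses at runtime.
for folder in ("/home/user/app/output", "/home/user/app/input"):
    os.makedirs(folder, exist_ok=True)              # should not raise PermissionError
    with tempfile.NamedTemporaryFile(dir=folder) as handle:
        handle.write(b"ok")                         # write access for the current user
print("App folders are writable by the current user.")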
app.py (CHANGED)

@@ -5,7 +5,7 @@ import gradio as gr
 from gradio_image_annotation import image_annotator
 
 from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE
-from tools.helper_functions import
+from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
@@ -20,11 +20,6 @@ from tools.textract_batch_call import analyse_document_with_textract_api, poll_b
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
 
-add_folder_to_path(TESSERACT_FOLDER)
-add_folder_to_path(POPPLER_FOLDER)
-
-ensure_output_folder_exists(OUTPUT_FOLDER)
-
 chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
 
 full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
@@ -369,8 +364,8 @@ with app:
     with gr.Accordion("Identify duplicate pages to redact", open = True):
         in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
         with gr.Row():
-            duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale
-            find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale =
+            duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
+            find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
 
         duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
 
@@ -588,7 +583,7 @@ with app:
 
     # Review OCR text buttom
     all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
-    reset_all_ocr_results_btn.click(
+    reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
 
     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
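The newly completed reset_all_ocr_results_btn.click wiring copies the unfiltered base OCR dataframe back into the visible, filterable one. For orientation only: the real reset_ocr_base_dataframe lives in tools/helper_functions.py and its body is not part of this commit, so the handler below is an assumption, but the click wiring follows the standard Gradio pattern used throughout app.py:

import gradio as gr
import pandas as pd

def reset_ocr_base_dataframe(base_df: pd.DataFrame) -> pd.DataFrame:
    # Assumed behaviour: hand the untouched base data back to the visible table
    return base_df

with gr.Blocks() as demo:
    all_line_level_ocr_results_df_base = gr.Dataframe(pd.DataFrame(columns=["page", "text"]), visible=False)
    all_line_level_ocr_results_df = gr.Dataframe(pd.DataFrame(columns=["page", "text"]))
    reset_all_ocr_results_btn = gr.Button("Reset all OCR results")
    reset_all_ocr_results_btn.click(reset_ocr_base_dataframe,
                                    inputs=[all_line_level_ocr_results_df_base],
                                    outputs=[all_line_level_ocr_results_df])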
requirements.txt (CHANGED)

@@ -7,12 +7,12 @@ presidio_anonymizer==2.2.358
 presidio-image-redactor==0.0.56
 pikepdf==9.5.2
 pandas==2.2.3
-#nltk==3.9.1 # Not required
 scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.23.3
+#gradio==5.23.3 # Using latest version of Gradio 5.25.0 below as it fixes the table select issues while filtered
+https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
 boto3==1.37.29
 pyarrow==19.0.1
 openpyxl==3.1.5
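Because Gradio is now installed from a prerelease wheel URL rather than a PyPI pin, a quick post-install check can confirm that the 5.25.0 build named in the wheel filename is what actually resolved; a minimal sketch:

from packaging.version import Version

import gradio

# The wheel above should give at least 5.25.0, the build that fixes selecting filtered table rows
assert Version(gradio.__version__) >= Version("5.25.0"), gradio.__version__
print("Gradio version:", gradio.__version__)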
tools/aws_textract.py (CHANGED)

@@ -39,12 +39,10 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
         else:
             client = boto3.client('textract', region_name=AWS_REGION)
     except:
-
+        out_message = "Cannot connect to AWS Textract"
+        print(out_message)
+        raise Exception(out_message)
         return [], "" # Return an empty list and an empty string
-
-    #print("Analysing page with AWS Textract")
-    #print("pdf_page_bytes:", pdf_page_bytes)
-    #print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
 
     # Redact signatures if specified
     if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -138,6 +136,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
     # This is a new page
     elif "page_no" in page_json_data:
         text_blocks = page_json_data["data"]["Blocks"]
+    else: text_blocks = []
 
     is_signature = False
     is_handwriting = False
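The except branch above now fails fast with a clear message instead of silently returning empty results, and json_to_ocrresult gains a default empty text_blocks for pages it does not recognise. For orientation only, the per-page analysis that the "Redact all identified signatures" option influences presumably ends up in one of the two standard boto3 Textract operations; in this sketch the helper name, region default and exact branching are assumptions, only the boto3 calls themselves are real:

import boto3

def analyse_page(pdf_page_bytes: bytes, redact_signatures: bool, region: str = "eu-west-2"):
    client = boto3.client("textract", region_name=region)
    if redact_signatures:
        # AnalyzeDocument is required to get SIGNATURE blocks back
        response = client.analyze_document(Document={"Bytes": pdf_page_bytes},
                                           FeatureTypes=["SIGNATURES"])
    else:
        # Plain line/word OCR is enough when signatures are not being redacted
        response = client.detect_document_text(Document={"Bytes": pdf_page_bytes})
    return response["Blocks"]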
tools/config.py (CHANGED)

@@ -28,16 +28,45 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
 
     return value
 
+def ensure_folder_exists(output_folder:str):
+    """Checks if the specified folder exists, creates it if not."""
+
+    if not os.path.exists(output_folder):
+        # Create the folder if it doesn't exist
+        os.makedirs(output_folder, exist_ok=True)
+        print(f"Created the {output_folder} folder.")
+    else:
+        print(f"The {output_folder} folder already exists.")
+
+def add_folder_to_path(folder_path: str):
+    '''
+    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
+    '''
+
+    if os.path.exists(folder_path) and os.path.isdir(folder_path):
+        print(folder_path, "folder exists.")
+
+        # Resolve relative path to absolute path
+        absolute_path = os.path.abspath(folder_path)
+
+        current_path = os.environ['PATH']
+        if absolute_path not in current_path.split(os.pathsep):
+            full_path_extension = absolute_path + os.pathsep + current_path
+            os.environ['PATH'] = full_path_extension
+            #print(f"Updated PATH with: ", full_path_extension)
+        else:
+            print(f"Directory {folder_path} already exists in PATH.")
+    else:
+        print(f"Folder not found at {folder_path} - not added to PATH")
 
 # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
-APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env')
+APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env') # e.g. config/app_config.env
 
 if APP_CONFIG_PATH:
     if os.path.exists(APP_CONFIG_PATH):
         print(f"Loading app variables from config file {APP_CONFIG_PATH}")
         load_dotenv(APP_CONFIG_PATH)
-    else:
-        print("App config file not found at location:", APP_CONFIG_PATH)
+    else: print("App config file not found at location:", APP_CONFIG_PATH)
 
 # Report logging to console?
 LOGGING = get_or_create_env_var('LOGGING', 'False')
@@ -51,14 +80,13 @@ if LOGGING == 'True':
 ###
 
 # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
-AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', 'config/aws_config.env')
+AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '') # e.g. config/aws_config.env
 
 if AWS_CONFIG_PATH:
     if os.path.exists(AWS_CONFIG_PATH):
         print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
         load_dotenv(AWS_CONFIG_PATH)
-    else:
-        print("AWS config file not found at location:", AWS_CONFIG_PATH)
+    else: print("AWS config file not found at location:", AWS_CONFIG_PATH)
 
 RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 
@@ -116,6 +144,9 @@ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
 OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
 INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
 
+ensure_folder_exists(OUTPUT_FOLDER)
+ensure_folder_exists(INPUT_FOLDER)
+
 # Allow for files to be saved in a temporary folder for increased security in some instances
 if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
     # Create a temporary directory
@@ -128,19 +159,35 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
 # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
 # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
 
-
-
-
+USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
+
+if USE_LOG_SUBFOLDERS == "True":
+    day_log_subfolder = today_rev + '/'
+    host_name_subfolder = HOST_NAME + '/'
+    full_log_subfolder = day_log_subfolder + host_name_subfolder
+else:
+    full_log_subfolder = ""
+
+FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
+ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
+USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
+
+ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
+ensure_folder_exists(ACCESS_LOGS_FOLDER)
+ensure_folder_exists(USAGE_LOGS_FOLDER)
 
 # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
 DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
 
 ###
 # REDACTION CONFIG
-###
-TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")
 
-
+# Create Tesseract and Poppler folders if you have installed them locally
+TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
+POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # e.g. poppler/poppler-24.02.0/Library/bin/
+
+if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
+if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
 
 
 # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
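All of the new settings (USE_LOG_SUBFOLDERS, the log folder paths, TESSERACT_FOLDER, POPPLER_FOLDER) flow through get_or_create_env_var, whose body is not shown in this hunk. A plausible sketch of its behaviour, assuming it simply prefers a value that is already set in the environment and otherwise records the default, which is also how a setting would be overridden from config/app_config.env or the shell:

import os

def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False) -> str:
    # Assumed behaviour: an environment variable that is already set wins over the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    if print_val:
        print(f"{var_name} = {value}")
    return value

# Example: switch off the per-day/per-host log subfolders introduced in this commit
os.environ["USE_LOG_SUBFOLDERS"] = "False"
print(get_or_create_env_var("USE_LOG_SUBFOLDERS", "True"))  # -> "False"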
tools/file_redaction.py (CHANGED)

@@ -330,7 +330,7 @@ def choose_and_run_redactor(file_paths:List[str],
 
 
     # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
-    if pii_identification_method ==
+    if pii_identification_method == aws_pii_detector:
         if aws_access_key_textbox and aws_secret_key_textbox:
             print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
             comprehend_client = boto3.client('comprehend',
@@ -349,7 +349,8 @@ def choose_and_run_redactor(file_paths:List[str],
             out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
             print(out_message)
             raise Exception(out_message)
-    else:
+    else:
+        comprehend_client = ""
 
     # Try to connect to AWS Textract Client if using that text extraction method
     if text_extraction_method == textract_option:
@@ -365,13 +366,17 @@ def choose_and_run_redactor(file_paths:List[str],
             print("Getting Textract credentials from environment variables.")
             textract_client = boto3.client('textract',
                 aws_access_key_id=AWS_ACCESS_KEY,
-                aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
+                aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
+        elif textract_output_found==True:
+            print("Existing Textract data found for file, no need to connect to AWS Textract")
+            textract_client = boto3.client('textract', region_name=AWS_REGION)
         else:
             textract_client = ""
-
-            print(
-
-    else:
+            out_message = "Cannot connect to AWS Textract service."
+            print(out_message)
+            raise Exception(out_message)
+    else:
+        textract_client = ""
 
     # Check if output_folder exists, create it if it doesn't
     if not os.path.exists(output_folder): os.makedirs(output_folder)
@@ -1208,8 +1213,7 @@ def redact_image_pdf(file_path:str,
 
     tic = time.perf_counter()
 
-    file_name = get_file_name_without_type(file_path)
-
+    file_name = get_file_name_without_type(file_path)
     comprehend_query_number_new = 0
 
     # Update custom word list analyser object with any new words that have been added to the custom deny list
@@ -1323,6 +1327,8 @@ def redact_image_pdf(file_path:str,
 
     # Check if page exists in existing textract data. If not, send to service to analyse
     if text_extraction_method == textract_option:
+        text_blocks = []
+
         if not textract_data:
             try:
                 # Convert the image_path to bytes using an in-memory buffer
@@ -1365,12 +1371,15 @@ def redact_image_pdf(file_path:str,
                     textract_data["pages"].append(text_blocks)
 
             except Exception as e:
-
+                out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
+                print(out_message)
                 text_blocks = []
-                new_request_metadata = "Failed Textract API call"
+                new_request_metadata = "Failed Textract API call"
 
                 # Check if "pages" key exists, if not, initialise it as an empty list
-                if "pages" not in textract_data: textract_data["pages"] = []
+                if "pages" not in textract_data: textract_data["pages"] = []
+
+                raise Exception(out_message)
 
             request_metadata = request_metadata + "\n" + new_request_metadata
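choose_and_run_redactor now raises a clear exception when no Textract client can be built, instead of carrying on with an empty placeholder, and failed per-page Textract calls are reported and re-raised. A simplified sketch of that client fallback order, with a hypothetical helper name and region default (the real code also reuses previously saved Textract output and keys typed into the UI):

import boto3

def make_textract_client(access_key: str = "", secret_key: str = "", region: str = "eu-west-2"):
    # Explicit keys from the UI win; otherwise fall back to whatever credentials boto3 finds
    if access_key and secret_key:
        return boto3.client("textract",
                            aws_access_key_id=access_key,
                            aws_secret_access_key=secret_key,
                            region_name=region)
    try:
        return boto3.client("textract", region_name=region)
    except Exception as error:
        # Mirror the commit's fail-fast behaviour rather than returning an empty string
        raise Exception("Cannot connect to AWS Textract service.") from error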
tools/helper_functions.py (CHANGED)

@@ -232,10 +232,10 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
     else:
         return False
 
-#
+#
 def add_folder_to_path(folder_path: str):
     '''
-    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
+    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
     '''
 
     if os.path.exists(folder_path) and os.path.isdir(folder_path):