Commit a33b955 · 1 Parent(s): 0b9e789
Fixed an issue where Docker containers built locally did not have the correct folder permissions. Improved the config file. Updated the Gradio version to fix an issue with selecting filtered rows. Minor bug fixes.
Files changed:
- Dockerfile (+10 -7)
- app.py (+4 -9)
- requirements.txt (+2 -2)
- tools/aws_textract.py (+4 -5)
- tools/config.py (+59 -12)
- tools/file_redaction.py (+21 -12)
- tools/helper_functions.py (+2 -2)
Dockerfile (CHANGED)

@@ -69,10 +69,11 @@ RUN chmod +x /entrypoint.sh
 # Switch to the "user" user
 USER user
 
+ENV APP_HOME=/home/user
+
 # Set environmental variables
-ENV HOME=/home/user \
-
-    PYTHONPATH=/home/user/app \
+ENV PATH=$APP_HOME/.local/bin:$PATH \
+    PYTHONPATH=$APP_HOME/app \
     PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     GRADIO_ALLOW_FLAGGING=never \
@@ -80,15 +81,17 @@ ENV HOME=/home/user \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
     GRADIO_ANALYTICS_ENABLED=False \
-
-    TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
+    TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
     SYSTEM=spaces
 
 # Set the working directory to the user's home directory
-WORKDIR $
+WORKDIR $APP_HOME/app
 
 # Copy the app code to the container
-COPY --chown=user . $
+COPY --chown=user . $APP_HOME/app
+
+# Ensure permissions are really user:user again after copying
+RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
 
 ENTRYPOINT [ "/entrypoint.sh" ]
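The new APP_HOME variable and the final chown/chmod step address the local-build case where the copied app tree was not writable by the "user" account. A small, hypothetical smoke test that could be run as that user inside the container; the output/ and input/ subfolders are the defaults from tools/config.py and /home/user/app comes from the new WORKDIR, so adjust if those assumptions do not hold:

import os
import tempfile

# Hypothetical check: confirm the app user can create and write files
# under the folders the app actually uses at runtime.
for folder in ("/home/user/app/output", "/home/user/app/input"):
    os.makedirs(folder, exist_ok=True)              # should not raise PermissionError
    with tempfile.NamedTemporaryFile(dir=folder) as handle:
        handle.write(b"ok")                         # write access for the current user
print("App folders are writable by the current user.")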
app.py (CHANGED)

@@ -5,7 +5,7 @@ import gradio as gr
 from gradio_image_annotation import image_annotator
 
 from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE
-from tools.helper_functions import
+from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
@@ -20,11 +20,6 @@ from tools.textract_batch_call import analyse_document_with_textract_api, poll_b
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
 
-add_folder_to_path(TESSERACT_FOLDER)
-add_folder_to_path(POPPLER_FOLDER)
-
-ensure_output_folder_exists(OUTPUT_FOLDER)
-
 chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
 
 full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
@@ -369,8 +364,8 @@ with app:
     with gr.Accordion("Identify duplicate pages to redact", open = True):
         in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
         with gr.Row():
-            duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale
-            find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale =
+            duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
+            find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
 
         duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
 
@@ -588,7 +583,7 @@ with app:
 
     # Review OCR text buttom
     all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
-    reset_all_ocr_results_btn.click(
+    reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
 
     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
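The newly completed reset_all_ocr_results_btn.click wiring copies the unfiltered base OCR dataframe back into the visible, filterable one. For orientation only: the real reset_ocr_base_dataframe lives in tools/helper_functions.py and its body is not part of this commit, so the handler below is an assumption, but the click wiring follows the standard Gradio pattern used throughout app.py:

import gradio as gr
import pandas as pd

def reset_ocr_base_dataframe(base_df: pd.DataFrame) -> pd.DataFrame:
    # Assumed behaviour: hand the untouched base data back to the visible table
    return base_df

with gr.Blocks() as demo:
    all_line_level_ocr_results_df_base = gr.Dataframe(pd.DataFrame(columns=["page", "text"]), visible=False)
    all_line_level_ocr_results_df = gr.Dataframe(pd.DataFrame(columns=["page", "text"]))
    reset_all_ocr_results_btn = gr.Button("Reset all OCR results")
    reset_all_ocr_results_btn.click(reset_ocr_base_dataframe,
                                    inputs=[all_line_level_ocr_results_df_base],
                                    outputs=[all_line_level_ocr_results_df])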
requirements.txt (CHANGED)

@@ -7,12 +7,12 @@ presidio_anonymizer==2.2.358
 presidio-image-redactor==0.0.56
 pikepdf==9.5.2
 pandas==2.2.3
-#nltk==3.9.1 # Not required
 scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.23.3
+#gradio==5.23.3 # Using latest version of Gradio 5.25.0 below as it fixes the table select issues while filtered
+https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
 boto3==1.37.29
 pyarrow==19.0.1
 openpyxl==3.1.5
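Because Gradio is now installed from a prerelease wheel URL rather than a PyPI pin, a quick post-install check can confirm that the 5.25.0 build named in the wheel filename is what actually resolved; a minimal sketch:

from packaging.version import Version

import gradio

# The wheel above should give at least 5.25.0, the build that fixes selecting filtered table rows
assert Version(gradio.__version__) >= Version("5.25.0"), gradio.__version__
print("Gradio version:", gradio.__version__)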
tools/aws_textract.py (CHANGED)

@@ -39,12 +39,10 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
         else:
             client = boto3.client('textract', region_name=AWS_REGION)
     except:
-
+        out_message = "Cannot connect to AWS Textract"
+        print(out_message)
+        raise Exception(out_message)
         return [], "" # Return an empty list and an empty string
-
-    #print("Analysing page with AWS Textract")
-    #print("pdf_page_bytes:", pdf_page_bytes)
-    #print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
 
     # Redact signatures if specified
     if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -138,6 +136,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
     # This is a new page
     elif "page_no" in page_json_data:
         text_blocks = page_json_data["data"]["Blocks"]
+    else: text_blocks = []
 
     is_signature = False
     is_handwriting = False
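The except branch above now fails fast with a clear message instead of silently returning empty results, and json_to_ocrresult gains a default empty text_blocks for pages it does not recognise. For orientation only, the per-page analysis that the "Redact all identified signatures" option influences presumably ends up in one of the two standard boto3 Textract operations; in this sketch the helper name, region default and exact branching are assumptions, only the boto3 calls themselves are real:

import boto3

def analyse_page(pdf_page_bytes: bytes, redact_signatures: bool, region: str = "eu-west-2"):
    client = boto3.client("textract", region_name=region)
    if redact_signatures:
        # AnalyzeDocument is required to get SIGNATURE blocks back
        response = client.analyze_document(Document={"Bytes": pdf_page_bytes},
                                           FeatureTypes=["SIGNATURES"])
    else:
        # Plain line/word OCR is enough when signatures are not being redacted
        response = client.detect_document_text(Document={"Bytes": pdf_page_bytes})
    return response["Blocks"]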
tools/config.py (CHANGED)

@@ -28,16 +28,45 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
 
     return value
 
+def ensure_folder_exists(output_folder:str):
+    """Checks if the specified folder exists, creates it if not."""
+
+    if not os.path.exists(output_folder):
+        # Create the folder if it doesn't exist
+        os.makedirs(output_folder, exist_ok=True)
+        print(f"Created the {output_folder} folder.")
+    else:
+        print(f"The {output_folder} folder already exists.")
+
+def add_folder_to_path(folder_path: str):
+    '''
+    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
+    '''
+
+    if os.path.exists(folder_path) and os.path.isdir(folder_path):
+        print(folder_path, "folder exists.")
+
+        # Resolve relative path to absolute path
+        absolute_path = os.path.abspath(folder_path)
+
+        current_path = os.environ['PATH']
+        if absolute_path not in current_path.split(os.pathsep):
+            full_path_extension = absolute_path + os.pathsep + current_path
+            os.environ['PATH'] = full_path_extension
+            #print(f"Updated PATH with: ", full_path_extension)
+        else:
+            print(f"Directory {folder_path} already exists in PATH.")
+    else:
+        print(f"Folder not found at {folder_path} - not added to PATH")
 
 # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
-APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env')
+APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env') # e.g. config/app_config.env
 
 if APP_CONFIG_PATH:
     if os.path.exists(APP_CONFIG_PATH):
         print(f"Loading app variables from config file {APP_CONFIG_PATH}")
         load_dotenv(APP_CONFIG_PATH)
-    else:
-        print("App config file not found at location:", APP_CONFIG_PATH)
+    else: print("App config file not found at location:", APP_CONFIG_PATH)
 
 # Report logging to console?
 LOGGING = get_or_create_env_var('LOGGING', 'False')
@@ -51,14 +80,13 @@ if LOGGING == 'True':
 ###
 
 # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
-AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', 'config/aws_config.env')
+AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '') # e.g. config/aws_config.env
 
 if AWS_CONFIG_PATH:
     if os.path.exists(AWS_CONFIG_PATH):
         print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
         load_dotenv(AWS_CONFIG_PATH)
-    else:
-        print("AWS config file not found at location:", AWS_CONFIG_PATH)
+    else: print("AWS config file not found at location:", AWS_CONFIG_PATH)
 
 RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 
@@ -116,6 +144,9 @@ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
 OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
 INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
 
+ensure_folder_exists(OUTPUT_FOLDER)
+ensure_folder_exists(INPUT_FOLDER)
+
 # Allow for files to be saved in a temporary folder for increased security in some instances
 if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
     # Create a temporary directory
@@ -128,19 +159,35 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
 # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
 # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
 
-
-
-
+USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
+
+if USE_LOG_SUBFOLDERS == "True":
+    day_log_subfolder = today_rev + '/'
+    host_name_subfolder = HOST_NAME + '/'
+    full_log_subfolder = day_log_subfolder + host_name_subfolder
+else:
+    full_log_subfolder = ""
+
+FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
+ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
+USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
+
+ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
+ensure_folder_exists(ACCESS_LOGS_FOLDER)
+ensure_folder_exists(USAGE_LOGS_FOLDER)
 
 # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
 DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
 
 ###
 # REDACTION CONFIG
-###
-TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")
 
-
+# Create Tesseract and Poppler folders if you have installed them locally
+TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
+POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # e.g. poppler/poppler-24.02.0/Library/bin/
+
+if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
+if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
 
 
 # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
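All of the new settings (USE_LOG_SUBFOLDERS, the log folder paths, TESSERACT_FOLDER, POPPLER_FOLDER) flow through get_or_create_env_var, whose body is not shown in this hunk. A plausible sketch of its behaviour, assuming it simply prefers a value that is already set in the environment and otherwise records the default, which is also how a setting would be overridden from config/app_config.env or the shell:

import os

def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False) -> str:
    # Assumed behaviour: an environment variable that is already set wins over the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    if print_val:
        print(f"{var_name} = {value}")
    return value

# Example: switch off the per-day/per-host log subfolders introduced in this commit
os.environ["USE_LOG_SUBFOLDERS"] = "False"
print(get_or_create_env_var("USE_LOG_SUBFOLDERS", "True"))  # -> "False"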
tools/file_redaction.py (CHANGED)

@@ -330,7 +330,7 @@ def choose_and_run_redactor(file_paths:List[str],
 
 
     # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
-    if pii_identification_method ==
+    if pii_identification_method == aws_pii_detector:
         if aws_access_key_textbox and aws_secret_key_textbox:
             print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
             comprehend_client = boto3.client('comprehend',
@@ -349,7 +349,8 @@ def choose_and_run_redactor(file_paths:List[str],
             out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
             print(out_message)
             raise Exception(out_message)
-    else:
+    else:
+        comprehend_client = ""
 
     # Try to connect to AWS Textract Client if using that text extraction method
     if text_extraction_method == textract_option:
@@ -365,13 +366,17 @@ def choose_and_run_redactor(file_paths:List[str],
             print("Getting Textract credentials from environment variables.")
             textract_client = boto3.client('textract',
                 aws_access_key_id=AWS_ACCESS_KEY,
-                aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
+                aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
+        elif textract_output_found==True:
+            print("Existing Textract data found for file, no need to connect to AWS Textract")
+            textract_client = boto3.client('textract', region_name=AWS_REGION)
         else:
             textract_client = ""
-
-            print(
-
-    else:
+            out_message = "Cannot connect to AWS Textract service."
+            print(out_message)
+            raise Exception(out_message)
+    else:
+        textract_client = ""
 
     # Check if output_folder exists, create it if it doesn't
     if not os.path.exists(output_folder): os.makedirs(output_folder)
@@ -1208,8 +1213,7 @@ def redact_image_pdf(file_path:str,
 
     tic = time.perf_counter()
 
-    file_name = get_file_name_without_type(file_path)
-
+    file_name = get_file_name_without_type(file_path)
     comprehend_query_number_new = 0
 
     # Update custom word list analyser object with any new words that have been added to the custom deny list
@@ -1323,6 +1327,8 @@ def redact_image_pdf(file_path:str,
 
     # Check if page exists in existing textract data. If not, send to service to analyse
     if text_extraction_method == textract_option:
+        text_blocks = []
+
         if not textract_data:
             try:
                 # Convert the image_path to bytes using an in-memory buffer
@@ -1365,12 +1371,15 @@ def redact_image_pdf(file_path:str,
                     textract_data["pages"].append(text_blocks)
 
             except Exception as e:
-
+                out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
+                print(out_message)
                 text_blocks = []
-                new_request_metadata = "Failed Textract API call"
+                new_request_metadata = "Failed Textract API call"
 
                 # Check if "pages" key exists, if not, initialise it as an empty list
-                if "pages" not in textract_data: textract_data["pages"] = []
+                if "pages" not in textract_data: textract_data["pages"] = []
+
+                raise Exception(out_message)
 
             request_metadata = request_metadata + "\n" + new_request_metadata
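choose_and_run_redactor now raises a clear exception when no Textract client can be built, instead of carrying on with an empty placeholder, and failed per-page Textract calls are reported and re-raised. A simplified sketch of that client fallback order, with a hypothetical helper name and region default (the real code also reuses previously saved Textract output and keys typed into the UI):

import boto3

def make_textract_client(access_key: str = "", secret_key: str = "", region: str = "eu-west-2"):
    # Explicit keys from the UI win; otherwise fall back to whatever credentials boto3 finds
    if access_key and secret_key:
        return boto3.client("textract",
                            aws_access_key_id=access_key,
                            aws_secret_access_key=secret_key,
                            region_name=region)
    try:
        return boto3.client("textract", region_name=region)
    except Exception as error:
        # Mirror the commit's fail-fast behaviour rather than returning an empty string
        raise Exception("Cannot connect to AWS Textract service.") from error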
tools/helper_functions.py (CHANGED)

@@ -232,10 +232,10 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
     else:
         return False
 
-#
+#
 def add_folder_to_path(folder_path: str):
     '''
-    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
+    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
     '''
 
     if os.path.exists(folder_path) and os.path.isdir(folder_path):