seanpedrickcase committed on
Commit a33b955 · 1 Parent(s): 0b9e789

Fixed an issue where Docker containers built locally did not get the correct folder permissions. Improved the config file. Updated the Gradio version to fix an issue with selecting filtered rows. Minor bug fixes.

Dockerfile CHANGED
@@ -69,10 +69,11 @@ RUN chmod +x /entrypoint.sh
 # Switch to the "user" user
 USER user
 
+ENV APP_HOME=/home/user
+
 # Set environmental variables
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH \
-    PYTHONPATH=/home/user/app \
+ENV PATH=$APP_HOME/.local/bin:$PATH \
+    PYTHONPATH=$APP_HOME/app \
     PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     GRADIO_ALLOW_FLAGGING=never \
@@ -80,15 +81,17 @@ ENV HOME=/home/user \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
     GRADIO_ANALYTICS_ENABLED=False \
-    GRADIO_THEME=huggingface \
-    TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
+    TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
     SYSTEM=spaces
 
 # Set the working directory to the user's home directory
-WORKDIR $HOME/app
+WORKDIR $APP_HOME/app
 
 # Copy the app code to the container
-COPY --chown=user . $HOME/app
+COPY --chown=user . $APP_HOME/app
+
+# Ensure permissions are really user:user again after copying
+RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app
 
 ENTRYPOINT [ "/entrypoint.sh" ]
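The permission fix above boils down to re-running chown/chmod on $APP_HOME/app after the COPY step. A minimal sketch for checking the result from inside a locally built container (this helper is hypothetical and not part of the commit; it only assumes the APP_HOME value set in the Dockerfile):

import os
import pwd

# Hypothetical check script: run inside the container as the "user" account
app_dir = os.path.join(os.environ.get("APP_HOME", "/home/user"), "app")

stat_info = os.stat(app_dir)
owner = pwd.getpwuid(stat_info.st_uid).pw_name

# After the added chown -R user:user / chmod -R u+rwX step, this should
# report ownership by "user" and confirm the directory is writable.
print(f"{app_dir} owned by: {owner}")
print(f"Writable by current user: {os.access(app_dir, os.W_OK)}")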
app.py CHANGED
@@ -5,7 +5,7 @@ import gradio as gr
 from gradio_image_annotation import image_annotator
 
 from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE
-from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
+from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
@@ -20,11 +20,6 @@ from tools.textract_batch_call import analyse_document_with_textract_api, poll_b
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
 
-add_folder_to_path(TESSERACT_FOLDER)
-add_folder_to_path(POPPLER_FOLDER)
-
-ensure_output_folder_exists(OUTPUT_FOLDER)
-
 chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
 
 full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
@@ -369,8 +364,8 @@ with app:
             with gr.Accordion("Identify duplicate pages to redact", open = True):
                 in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
                 with gr.Row():
-                    duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
-                    find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 5)
+                    duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
+                    find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
 
                 duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
 
@@ -588,7 +583,7 @@ with app:
 
     # Review OCR text buttom
     all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
-    reset_all_ocr_results_btn.click(reset_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
+    reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
 
     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
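Besides moving the Tesseract/Poppler path setup and output-folder creation into tools/config.py, the fix here points the reset button at reset_ocr_base_dataframe rather than reset_base_dataframe. The wiring itself is the usual Gradio Blocks click-event pattern; a self-contained sketch of that pattern (component and function names below are illustrative, not the app's own):

import gradio as gr
import pandas as pd

base_df = pd.DataFrame({"page": [1, 2], "text": ["alpha", "beta"]})

def reset_table(base: pd.DataFrame) -> pd.DataFrame:
    # Hand back a copy of the unfiltered base table so the visible table is restored
    return base.copy()

with gr.Blocks() as demo:
    base_state = gr.State(base_df)
    table = gr.Dataframe(value=base_df, interactive=False)
    reset_btn = gr.Button("Reset table")
    # Same shape as the reset_all_ocr_results_btn.click(...) call in app.py
    reset_btn.click(reset_table, inputs=[base_state], outputs=[table])

demo.launch()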
requirements.txt CHANGED
@@ -7,12 +7,12 @@ presidio_anonymizer==2.2.358
 presidio-image-redactor==0.0.56
 pikepdf==9.5.2
 pandas==2.2.3
-#nltk==3.9.1 # Not required
 scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.23.3
+#gradio==5.23.3 # Using latest version of Gradio 5.25.0 below as it fixes the table select issues while filtered
+https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
 boto3==1.37.29
 pyarrow==19.0.1
 openpyxl==3.1.5
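With Gradio now pinned to a pre-release wheel instead of a PyPI release, a quick runtime check is a reasonable safeguard. A hedged example (not part of the repository; it assumes the packaging library is available, which it normally is alongside Gradio's own dependencies):

import gradio as gr
from packaging.version import Version

# The wheel above should install Gradio 5.25.0, which carries the fix for
# selecting rows while the table is filtered.
assert Version(gr.__version__) >= Version("5.25.0"), gr.__version__
print("Gradio version:", gr.__version__)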
tools/aws_textract.py CHANGED
@@ -39,12 +39,10 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
     else:
         client = boto3.client('textract', region_name=AWS_REGION)
     except:
-        print("Cannot connect to AWS Textract")
+        out_message = "Cannot connect to AWS Textract"
+        print(out_message)
+        raise Exception(out_message)
         return [], "" # Return an empty list and an empty string
-
-    #print("Analysing page with AWS Textract")
-    #print("pdf_page_bytes:", pdf_page_bytes)
-    #print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
 
     # Redact signatures if specified
     if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -138,6 +136,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
         # This is a new page
         elif "page_no" in page_json_data:
             text_blocks = page_json_data["data"]["Blocks"]
+        else: text_blocks = []
 
         is_signature = False
         is_handwriting = False
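The first hunk replaces a silent return with a raised exception when the Textract client cannot be created, so failures surface to the caller instead of quietly producing empty OCR results. Stripped of the surrounding function, the fail-fast pattern looks roughly like this (a sketch, not the module's actual signature; the region is passed in rather than read from AWS_REGION):

import boto3

def get_textract_client(region_name: str):
    """Create a Textract client, raising immediately if it cannot be built."""
    try:
        return boto3.client("textract", region_name=region_name)
    except Exception as e:
        out_message = "Cannot connect to AWS Textract"
        print(out_message)
        # Propagate the failure rather than silently returning empty results
        raise Exception(out_message) from e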
tools/config.py CHANGED
@@ -28,16 +28,45 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
 
     return value
 
+def ensure_folder_exists(output_folder:str):
+    """Checks if the specified folder exists, creates it if not."""
+
+    if not os.path.exists(output_folder):
+        # Create the folder if it doesn't exist
+        os.makedirs(output_folder, exist_ok=True)
+        print(f"Created the {output_folder} folder.")
+    else:
+        print(f"The {output_folder} folder already exists.")
+
+def add_folder_to_path(folder_path: str):
+    '''
+    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
+    '''
+
+    if os.path.exists(folder_path) and os.path.isdir(folder_path):
+        print(folder_path, "folder exists.")
+
+        # Resolve relative path to absolute path
+        absolute_path = os.path.abspath(folder_path)
+
+        current_path = os.environ['PATH']
+        if absolute_path not in current_path.split(os.pathsep):
+            full_path_extension = absolute_path + os.pathsep + current_path
+            os.environ['PATH'] = full_path_extension
+            #print(f"Updated PATH with: ", full_path_extension)
+        else:
+            print(f"Directory {folder_path} already exists in PATH.")
+    else:
+        print(f"Folder not found at {folder_path} - not added to PATH")
 
 # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
-APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env')
+APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env') # e.g. config/app_config.env
 
 if APP_CONFIG_PATH:
     if os.path.exists(APP_CONFIG_PATH):
         print(f"Loading app variables from config file {APP_CONFIG_PATH}")
         load_dotenv(APP_CONFIG_PATH)
-    else:
-        print("App config file not found at location:", APP_CONFIG_PATH)
+    else: print("App config file not found at location:", APP_CONFIG_PATH)
 
 # Report logging to console?
 LOGGING = get_or_create_env_var('LOGGING', 'False')
@@ -51,14 +80,13 @@ if LOGGING == 'True':
 ###
 
 # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
-AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', 'config/aws_config.env')
+AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '') # e.g. config/aws_config.env
 
 if AWS_CONFIG_PATH:
     if os.path.exists(AWS_CONFIG_PATH):
         print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
         load_dotenv(AWS_CONFIG_PATH)
-    else:
-        print("AWS config file not found at location:", AWS_CONFIG_PATH)
+    else: print("AWS config file not found at location:", AWS_CONFIG_PATH)
 
 RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 
@@ -116,6 +144,9 @@ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
 OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
 INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
 
+ensure_folder_exists(OUTPUT_FOLDER)
+ensure_folder_exists(INPUT_FOLDER)
+
 # Allow for files to be saved in a temporary folder for increased security in some instances
 if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
     # Create a temporary directory
@@ -128,19 +159,35 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
 # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
 # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
 
-FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + today_rev + '/' + HOST_NAME + '/')
-ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + today_rev + '/' + HOST_NAME + '/')
-USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + today_rev + '/' + HOST_NAME + '/')
+USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
+
+if USE_LOG_SUBFOLDERS == "True":
+    day_log_subfolder = today_rev + '/'
+    host_name_subfolder = HOST_NAME + '/'
+    full_log_subfolder = day_log_subfolder + host_name_subfolder
+else:
+    full_log_subfolder = ""
+
+FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
+ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
+USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
+
+ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
+ensure_folder_exists(ACCESS_LOGS_FOLDER)
+ensure_folder_exists(USAGE_LOGS_FOLDER)
 
 # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
 DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
 
 ###
 # REDACTION CONFIG
-###
-TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")
 
-POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "poppler/poppler-24.02.0/Library/bin/")
+# Create Tesseract and Poppler folders if you have installed them locally
+TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
+POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # e.g. poppler/poppler-24.02.0/Library/bin/
+
+if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
+if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
 
 
 # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
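The main behavioural change in the config is the new USE_LOG_SUBFOLDERS switch: with it on (the default) log folders gain a date and hostname suffix, and with it off they collapse to plain feedback/, logs/ and usage/ directories. A standalone sketch of the resulting paths (the date format and hostname lookup are illustrative stand-ins for today_rev and HOST_NAME in tools/config.py):

import os
import socket
from datetime import datetime

# Illustrative stand-ins for today_rev and HOST_NAME
today_rev = datetime.now().strftime("%Y%m%d")
host_name = socket.gethostname()

use_log_subfolders = os.environ.get("USE_LOG_SUBFOLDERS", "True")

if use_log_subfolders == "True":
    full_log_subfolder = today_rev + "/" + host_name + "/"
else:
    full_log_subfolder = ""

feedback_logs_folder = "feedback/" + full_log_subfolder
access_logs_folder = "logs/" + full_log_subfolder
usage_logs_folder = "usage/" + full_log_subfolder

# e.g. 'feedback/20250101/my-host/' with subfolders on, 'feedback/' with them off
print(feedback_logs_folder, access_logs_folder, usage_logs_folder)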
tools/file_redaction.py CHANGED
@@ -330,7 +330,7 @@ def choose_and_run_redactor(file_paths:List[str],
 
 
     # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
-    if pii_identification_method == "AWS Comprehend":
+    if pii_identification_method == aws_pii_detector:
         if aws_access_key_textbox and aws_secret_key_textbox:
             print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
             comprehend_client = boto3.client('comprehend',
@@ -349,7 +349,8 @@ def choose_and_run_redactor(file_paths:List[str],
             out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
             print(out_message)
             raise Exception(out_message)
-    else: comprehend_client = ""
+    else:
+        comprehend_client = ""
 
     # Try to connect to AWS Textract Client if using that text extraction method
     if text_extraction_method == textract_option:
@@ -365,13 +366,17 @@ def choose_and_run_redactor(file_paths:List[str],
             print("Getting Textract credentials from environment variables.")
             textract_client = boto3.client('textract',
                 aws_access_key_id=AWS_ACCESS_KEY,
-                aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
+                aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
+        elif textract_output_found==True:
+            print("Existing Textract data found for file, no need to connect to AWS Textract")
+            textract_client = boto3.client('textract', region_name=AWS_REGION)
         else:
             textract_client = ""
-            out_message_warning = "Cannot connect to AWS Textract service."
-            print(out_message_warning)
-            #raise Warning(out_message)
-    else: textract_client = ""
+            out_message = "Cannot connect to AWS Textract service."
+            print(out_message)
+            raise Exception(out_message)
+    else:
+        textract_client = ""
 
     # Check if output_folder exists, create it if it doesn't
     if not os.path.exists(output_folder): os.makedirs(output_folder)
@@ -1208,8 +1213,7 @@ def redact_image_pdf(file_path:str,
 
     tic = time.perf_counter()
 
-    file_name = get_file_name_without_type(file_path)
-
+    file_name = get_file_name_without_type(file_path)
     comprehend_query_number_new = 0
 
     # Update custom word list analyser object with any new words that have been added to the custom deny list
@@ -1323,6 +1327,8 @@ def redact_image_pdf(file_path:str,
 
         # Check if page exists in existing textract data. If not, send to service to analyse
        if text_extraction_method == textract_option:
+            text_blocks = []
+
            if not textract_data:
                try:
                    # Convert the image_path to bytes using an in-memory buffer
@@ -1365,12 +1371,15 @@ def redact_image_pdf(file_path:str,
                     textract_data["pages"].append(text_blocks)
 
                 except Exception as e:
-                    print("Textract extraction for page", reported_page_number, "failed due to:", e)
+                    out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
+                    print(out_message)
                     text_blocks = []
-                    new_request_metadata = "Failed Textract API call"
+                    new_request_metadata = "Failed Textract API call"
 
                     # Check if "pages" key exists, if not, initialise it as an empty list
-                    if "pages" not in textract_data: textract_data["pages"] = []
+                    if "pages" not in textract_data: textract_data["pages"] = []
+
+                    raise Exception(out_message)
 
                 request_metadata = request_metadata + "\n" + new_request_metadata
 
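The Textract branch in choose_and_run_redactor now covers three cases: explicit credentials, previously saved Textract output (no fresh connection needed), and a hard failure that raises instead of logging a warning. Condensed into a standalone function (a simplified sketch with illustrative argument names; the real code also consults RUN_AWS_FUNCTIONS and environment-variable keys):

import boto3

def choose_textract_client(access_key: str, secret_key: str, region: str,
                           textract_output_found: bool):
    """Condensed sketch of the Textract client selection logic."""
    if access_key and secret_key:
        # Explicit credentials supplied (e.g. from the settings textboxes)
        return boto3.client("textract",
                            aws_access_key_id=access_key,
                            aws_secret_access_key=secret_key,
                            region_name=region)
    elif textract_output_found:
        # Existing Textract data found for the file, so no new analysis call is needed
        print("Existing Textract data found for file, no need to connect to AWS Textract")
        return boto3.client("textract", region_name=region)
    else:
        out_message = "Cannot connect to AWS Textract service."
        print(out_message)
        raise Exception(out_message)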
tools/helper_functions.py CHANGED
@@ -232,10 +232,10 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
     else:
         return False
 
-# Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
+#
 def add_folder_to_path(folder_path: str):
     '''
-    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
+    Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
     '''
 
     if os.path.exists(folder_path) and os.path.isdir(folder_path):
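The comment that used to sit above add_folder_to_path has been folded into its docstring: the function exists for PyInstaller builds, where bundled tesseract and poppler folders must be prepended to PATH. A short usage sketch (the poppler path is only an example, taken from the default mentioned in tools/config.py):

import os
from tools.helper_functions import add_folder_to_path

# Example path only - point this at wherever a local poppler build actually lives
add_folder_to_path("poppler/poppler-24.02.0/Library/bin/")

# If the folder exists, its absolute path now sits at the front of PATH
print(os.environ["PATH"].split(os.pathsep)[0])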