|
import os |
|
import tempfile |
|
import socket |
|
import logging |
|
from datetime import datetime |
|
from dotenv import load_dotenv |
|
from tldextract import TLDExtract |
|
|
|
# Current local date as YYYYMMDD, captured once at import time.
today_rev = datetime.today().strftime("%Y%m%d")

# Name of the machine this process is running on.
HOST_NAME = socket.gethostname()
|
|
|
|
|
|
|
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False) -> str:
    """Return the value of an environment variable, creating it if it is unset.

    When ``var_name`` is missing from ``os.environ`` it is set to
    ``default_value``, so later lookups in this process see the same value.
    A variable that is set to an empty string counts as set and is returned
    unchanged.

    Args:
        var_name: Name of the environment variable to read.
        default_value: Value stored and returned when the variable is unset.
        print_val: When truthy, print the resolved value to stdout.

    Returns:
        The existing value of the variable, or ``default_value`` if it was unset.
    """
    # setdefault returns the existing value, or stores and returns the default.
    value = os.environ.setdefault(var_name, default_value)

    if print_val:
        print(f'The value of {var_name} is {value}')

    return value
|
|
|
def ensure_folder_exists(output_folder: str):
    """Create ``output_folder`` if nothing exists at that path, and report the outcome.

    Prints one message saying whether the folder was created or was already
    present. Returns nothing.
    """
    if os.path.exists(output_folder):
        print(f"The {output_folder} folder already exists.")
        return

    # Intermediate directories are created as needed.
    os.makedirs(output_folder, exist_ok=True)
    print(f"Created the {output_folder} folder.")
|
|
|
def add_folder_to_path(folder_path: str):
    """Prepend an existing folder to this process's ``PATH``.

    If ``folder_path`` is an existing directory, its absolute path is put at
    the front of ``os.environ['PATH']`` unless it is already one of the PATH
    entries. This is only relevant for locally created executables of this
    app: PyInstaller produces an ``_internal`` folder containing tesseract and
    poppler, which must be on PATH for the app to run.

    Args:
        folder_path: Relative or absolute path of the folder to add.

    Returns:
        None. The outcome is reported with ``print``.
    """
    if not os.path.isdir(folder_path):
        print(f"Folder not found at {folder_path} - not added to PATH")
        return

    print(folder_path, "folder exists.")

    absolute_path = os.path.abspath(folder_path)

    # PATH may be missing entirely in a stripped-down environment; treat that
    # as an empty PATH instead of raising KeyError.
    current_path = os.environ.get('PATH', '')

    if absolute_path in current_path.split(os.pathsep):
        print(f"Directory {folder_path} already exists in PATH.")
        return

    # Avoid a dangling separator when there was no previous PATH value.
    if current_path:
        os.environ['PATH'] = absolute_path + os.pathsep + current_path
    else:
        os.environ['PATH'] = absolute_path
|
|
|
# Make sure the folder that holds the default config files is present.
ensure_folder_exists("config/")

# Location of the app-level .env file; can be overridden through the environment.
APP_CONFIG_PATH = get_or_create_env_var("APP_CONFIG_PATH", "config/app_config.env")

# Load the file when it is present; otherwise only report that it is missing.
# An empty APP_CONFIG_PATH skips both branches.
if APP_CONFIG_PATH and os.path.exists(APP_CONFIG_PATH):
    print(f"Loading app variables from config file {APP_CONFIG_PATH}")
    load_dotenv(APP_CONFIG_PATH)
elif APP_CONFIG_PATH:
    print("App config file not found at location:", APP_CONFIG_PATH)
|
|
|
|
|
# Set LOGGING to the exact string 'True' to configure the root logger at INFO level.
LOGGING = get_or_create_env_var("LOGGING", "False")

if LOGGING == "True":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
# Optional .env file holding AWS settings. The default is an empty string,
# in which case no file is looked for at all.
AWS_CONFIG_PATH = get_or_create_env_var("AWS_CONFIG_PATH", "")

if AWS_CONFIG_PATH and os.path.exists(AWS_CONFIG_PATH):
    print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
    load_dotenv(AWS_CONFIG_PATH)
elif AWS_CONFIG_PATH:
    print("AWS config file not found at location:", AWS_CONFIG_PATH)

# All values below are plain strings taken from the environment.
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")

AWS_REGION = get_or_create_env_var("AWS_REGION", "")

AWS_CLIENT_ID = get_or_create_env_var("AWS_CLIENT_ID", "")

AWS_CLIENT_SECRET = get_or_create_env_var("AWS_CLIENT_SECRET", "")

AWS_USER_POOL_ID = get_or_create_env_var("AWS_USER_POOL_ID", "")

# Only the presence of the keys is reported; the values are never printed.
AWS_ACCESS_KEY = get_or_create_env_var("AWS_ACCESS_KEY", "")
if AWS_ACCESS_KEY:
    print("AWS_ACCESS_KEY found in environment variables")

AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")
if AWS_SECRET_KEY:
    print("AWS_SECRET_KEY found in environment variables")

DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
|
|
|
|
|
|
|
|
|
|
|
# Custom header name/value pair; both default to an empty string.
CUSTOM_HEADER = get_or_create_env_var("CUSTOM_HEADER", "")

CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")

# Image options. These stay as strings here; any numeric or boolean
# conversion is left to the code that reads them.
IMAGES_DPI = get_or_create_env_var("IMAGES_DPI", "300.0")
LOAD_TRUNCATED_IMAGES = get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
MAX_IMAGE_PIXELS = get_or_create_env_var("MAX_IMAGE_PIXELS", "")
|
|
|
|
|
|
|
|
|
SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')

# Folders for generated output and uploaded input. Note the environment
# variable names carry a GRADIO_ prefix that the Python constants do not.
OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')

# The sentinel value "TEMP" asks for a temporary directory instead of a fixed
# folder. Resolve it BEFORE creating folders, so that no directory literally
# named "TEMP" is created in the working directory.
if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
    # Keep the TemporaryDirectory object referenced at module level. Using it
    # as a context manager would delete the directory as soon as the `with`
    # block ended, leaving the folder constants pointing at a removed path.
    # Holding the handle keeps the directory alive while this module is
    # loaded; it is removed when the handle is finalized.
    _TEMP_DIR_HANDLE = tempfile.TemporaryDirectory()
    temp_dir = _TEMP_DIR_HANDLE.name
    print(f'Temporary directory created at: {temp_dir}')

    if OUTPUT_FOLDER == "TEMP":
        OUTPUT_FOLDER = temp_dir + "/"
    if INPUT_FOLDER == "TEMP":
        INPUT_FOLDER = temp_dir + "/"

ensure_folder_exists(OUTPUT_FOLDER)
ensure_folder_exists(INPUT_FOLDER)
|
|
|
|
|
|
|
|
|
SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")

USE_LOG_SUBFOLDERS = get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")

# With subfolders enabled, the default log folders get a "<YYYYMMDD>/<hostname>/" suffix.
if USE_LOG_SUBFOLDERS == "True":
    day_log_subfolder = f"{today_rev}/"
    host_name_subfolder = f"{HOST_NAME}/"
    full_log_subfolder = f"{day_log_subfolder}{host_name_subfolder}"
else:
    full_log_subfolder = ""

# The suffix only affects the defaults; a folder set in the environment is used as given.
FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/" + full_log_subfolder)
ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/" + full_log_subfolder)
USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/" + full_log_subfolder)

ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
ensure_folder_exists(ACCESS_LOGS_FOLDER)
ensure_folder_exists(USAGE_LOGS_FOLDER)

DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var("DISPLAY_FILE_NAMES_IN_LOGS", "False")

# Header lists are kept as strings. The usage default below is written as a
# JSON-style list of names; how it is parsed is not shown in this module.
CSV_ACCESS_LOG_HEADERS = get_or_create_env_var("CSV_ACCESS_LOG_HEADERS", "")
CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var("CSV_FEEDBACK_LOG_HEADERS", "")
CSV_USAGE_LOG_HEADERS = get_or_create_env_var(
    "CSV_USAGE_LOG_HEADERS",
    '["session_hash_textbox", '
    '"doc_full_file_name_textbox", '
    '"data_full_file_name_textbox", '
    '"actual_time_taken_number", '
    '"total_page_count", '
    '"textract_query_number", '
    '"pii_detection_method", '
    '"comprehend_query_number", '
    '"cost_code", '
    '"textract_handwriting_signature", '
    '"host_name_textbox", '
    '"text_extraction_method", '
    '"is_this_a_textract_api_call"]',
)
|
|
|
|
|
|
|
# DynamoDB logging settings: one table name and one header string per log type.
SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")

# Access log
ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var("ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log")
DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var("DYNAMODB_ACCESS_LOG_HEADERS", "")

# Feedback log
FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var("FEEDBACK_LOG_DYNAMODB_TABLE_NAME", "redaction_feedback")
DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var("DYNAMODB_FEEDBACK_LOG_HEADERS", "")

# Usage log
USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var("USAGE_LOG_DYNAMODB_TABLE_NAME", "redaction_usage")
DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
# Optional folders containing the Tesseract and Poppler binaries. Each one
# that is set to a non-empty value is handed to add_folder_to_path.
TESSERACT_FOLDER = get_or_create_env_var("TESSERACT_FOLDER", "")
POPPLER_FOLDER = get_or_create_env_var("POPPLER_FOLDER", "")

if TESSERACT_FOLDER:
    add_folder_to_path(TESSERACT_FOLDER)

if POPPLER_FOLDER:
    add_folder_to_path(POPPLER_FOLDER)
|
|
|
|
|
|
|
# Redaction-related settings. All are kept as strings; numeric-looking values
# such as PAGE_BREAK_VALUE and MAX_TIME_VALUE are not converted here.
PAGE_BREAK_VALUE = get_or_create_env_var("PAGE_BREAK_VALUE", "99999")

MAX_TIME_VALUE = get_or_create_env_var("MAX_TIME_VALUE", "999999")

CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")

REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en")
|
|
|
|
|
|
|
|
|
|
|
# Cache location passed to TLDExtract as its cache_dir.
TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')

try:
    extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
except Exception as tld_error:
    # Best-effort fallback: build the extractor with cache_dir=None. Catching
    # Exception rather than using a bare `except` lets KeyboardInterrupt and
    # SystemExit propagate, and the failure is reported instead of swallowed.
    print(f"Could not initialise TLDExtract with cache_dir {TLDEXTRACT_CACHE}: {tld_error}. Retrying with cache_dir=None.")
    extract = TLDExtract(cache_dir=None)
|
|
|
|
|
# App run options. Values are strings unless explicitly converted below.
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")

RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")

# Converted to int at import time; a non-numeric value raises ValueError here.
MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))

MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb")

# Converted to int at import time; a non-numeric value raises ValueError here.
GRADIO_SERVER_PORT = int(get_or_create_env_var("GRADIO_SERVER_PORT", "7860"))

ROOT_PATH = get_or_create_env_var("ROOT_PATH", "")

# Unlike MAX_QUEUE_SIZE, this one is left as a string.
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var("DEFAULT_CONCURRENCY_LIMIT", "3")

GET_DEFAULT_ALLOW_LIST = get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "")

ALLOW_LIST_PATH = get_or_create_env_var("ALLOW_LIST_PATH", "")

S3_ALLOW_LIST_PATH = get_or_create_env_var("S3_ALLOW_LIST_PATH", "")

# Use the configured allow-list path when one is given, else the default under config/.
OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH if ALLOW_LIST_PATH else "config/default_allow_list.csv"
|
|
|
|
|
|
|
# Cost code options, all read as strings.
SHOW_COSTS = get_or_create_env_var("SHOW_COSTS", "False")

GET_COST_CODES = get_or_create_env_var("GET_COST_CODES", "False")

DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")

COST_CODES_PATH = get_or_create_env_var("COST_CODES_PATH", "")

S3_COST_CODES_PATH = get_or_create_env_var("S3_COST_CODES_PATH", "")

# Use the configured cost codes path when one is given, else the default under config/.
OUTPUT_COST_CODES_PATH = COST_CODES_PATH if COST_CODES_PATH else "config/cost_codes.csv"

ENFORCE_COST_CODES = get_or_create_env_var("ENFORCE_COST_CODES", "False")

# Enforcing cost codes forces GET_COST_CODES on, whatever it was set to above.
if ENFORCE_COST_CODES == "True":
    GET_COST_CODES = "True"
|
|
|
|
|
|
|
# Whole-document Textract options, all read as strings.
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var(
    "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False"
)

TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var(
    "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", ""
)

TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var(
    "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER", "input"
)

TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var(
    "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output"
)

LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var(
    "LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False"
)

# S3 and local locations for Textract job records; both default to "output".
TEXTRACT_JOBS_S3_LOC = get_or_create_env_var("TEXTRACT_JOBS_S3_LOC", "output")

TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var("TEXTRACT_JOBS_LOCAL_LOC", "output")