# underwriting-workflow/utils/process_files.py
# vamsidharmuthireddy's picture
# Upload 90 files
# 52c1998 verified
import tempfile
import os
import pdf2image
import zipfile
from .logger import setup_logger
import pandas as pd
logger = setup_logger(__name__)
def process_pdf(file_path: str, file_groups: dict, file_paths: list):
    """Rasterise a PDF page by page and register the resulting PNG files.

    Every page is written as ``<file_path>_page_<index>.png`` (zero-based
    index). Both containers are updated in place and handed back to the
    caller.

    Args:
        file_path: Location of the PDF to convert.
        file_groups: Mapping that receives ``file_path -> [page image paths]``.
        file_paths: Flat list that receives every page image path.

    Returns:
        The ``(file_groups, file_paths)`` pair that was passed in.
    """
    rendered_pages = pdf2image.convert_from_path(file_path)
    page_image_paths = []

    for page_index, page in enumerate(rendered_pages):
        target_path = f"{file_path}_page_{page_index}.png"
        page.save(target_path, "PNG")
        # Register the page only after it has been written successfully.
        file_paths.append(target_path)
        page_image_paths.append(target_path)

    # Group all page images under the PDF they were rendered from.
    file_groups[file_path] = page_image_paths
    return file_groups, file_paths
# MIME types under which browsers submit ZIP archives. Browsers on Windows
# commonly report "application/x-zip-compressed" instead of "application/zip".
_ZIP_MIME_TYPES = ("application/zip", "application/x-zip-compressed")
_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg")


def _extraction_dir_for(archive_path):
    """Return a directory path, distinct from *archive_path*, to extract it into."""
    stem, _ = os.path.splitext(archive_path)
    # An archive without an extension would otherwise map onto its own path.
    return stem if stem != archive_path else archive_path + "_extracted"


def _load_application_form(csv_path):
    """Read the header-less CSV and prefix column 0 with 'application_summary_'."""
    application_form = pd.read_csv(csv_path, header=None)
    logger.info(f"application_form: {application_form}")
    application_form[0] = 'application_summary_' + \
        application_form[0].str.strip()
    return application_form


def _select_rows(sheet, *row_slices):
    """Stack the given positional row slices of *sheet*, keeping columns 0 and 1."""
    stacked = pd.concat(
        [sheet.iloc[rows] for rows in row_slices], axis=0, ignore_index=True)
    return stacked[[0, 1]]


def _load_memo(xlsx_path):
    """Build the memo dict from fixed row ranges of Sheet1, Sheet6 and Sheet7."""
    # NOTE(review): the row ranges look tied to one specific workbook
    # template -- confirm against the expected spreadsheet layout.
    sheets = pd.read_excel(xlsx_path, sheet_name=None, header=None)
    memo = {
        "Mortgage Details": _select_rows(
            sheets['Sheet1'], slice(31, 32), slice(33, 34), slice(50, 51)),
        # Deposit details
        "Deposit details": _select_rows(sheets['Sheet6'], slice(44, 47)),
        # Monthly costs for both applicants
        "Monthly costs for both applicants": _select_rows(
            sheets['Sheet7'], slice(5, 23)),
    }
    logger.info(f"memo : {memo}")
    return memo


def process_uploaded_files(uploaded_files):
    """Persist uploaded files to a fresh temp directory and index their contents.

    Each upload is written to disk and then handled according to its MIME type:

    * PDF: every page is rendered to a PNG via ``process_pdf``.
    * image: registered as-is.
    * ZIP: extracted, then every extracted file is handled by extension --
      ``.pdf`` and image files as above, ``.csv`` is loaded as the application
      form, ``.xlsx`` is loaded as the memo, and any other file is registered
      as-is.

    Uploads of any other MIME type are written to disk but not registered.

    Args:
        uploaded_files: Iterable of upload objects exposing ``name``, ``type``
            and ``getbuffer()``.

    Returns:
        A 5-tuple ``(file_paths, file_groups, temp_dir, application_form, memo)``:
        the flat list of registered paths, a mapping from each source document
        to its image/file paths, the temp directory holding everything (not
        removed here), the application-form DataFrame or ``None``, and the
        memo dict of DataFrames or ``None``. When several CSV/XLSX files are
        found, the last one processed wins.

    Raises:
        zipfile.BadZipFile: If a ZIP-typed upload is not a valid archive.
        KeyError: If a workbook lacks ``Sheet1``/``Sheet6``/``Sheet7`` or the
            first two columns.
    """
    file_paths = []
    file_groups = {}
    application_form = None
    memo = None
    temp_dir = tempfile.mkdtemp()
    logger.info(f"temp_dir {temp_dir}")

    for uploaded_file in uploaded_files:
        # Keep only the final path component so a crafted upload name
        # (absolute path or "..") cannot be written outside temp_dir.
        file_path = os.path.join(
            temp_dir, os.path.basename(uploaded_file.name))
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        logger.info(
            f"file_path: {file_path}, uploaded_file.type : {uploaded_file.type}")

        mime_type = uploaded_file.type or ""
        if mime_type == "application/pdf":
            file_groups, file_paths = process_pdf(
                file_path=file_path, file_groups=file_groups, file_paths=file_paths)
        elif mime_type.startswith("image"):
            file_paths.append(file_path)
            file_groups[file_path] = [file_path]
        elif mime_type in _ZIP_MIME_TYPES:
            extract_dir = _extraction_dir_for(file_path)
            logger.info(f"extract_dir : {extract_dir}")
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)

            for root, _, files in os.walk(extract_dir):
                for file in files:
                    extracted_path = os.path.join(root, file)
                    lowered = file.lower()
                    if lowered.endswith(".pdf"):
                        file_groups, file_paths = process_pdf(
                            file_path=extracted_path,
                            file_groups=file_groups,
                            file_paths=file_paths)
                    elif lowered.endswith(".csv"):
                        application_form = _load_application_form(
                            extracted_path)
                    elif lowered.endswith(".xlsx"):
                        memo = _load_memo(extracted_path)
                    else:
                        # Images and every unrecognised file type are
                        # registered unchanged as single-file groups.
                        file_paths.append(extracted_path)
                        file_groups[extracted_path] = [extracted_path]

    logger.info(f"file_groups : {file_groups}")
    return file_paths, file_groups, temp_dir, application_form, memo