import json
import os

import streamlit as st
from PIL import Image

from utils import im_2_b64, load_pdf_as_image, generate_metadata
from .llm import DocumentLLM
from prompts import (document_type_prompt, passport_prompt, payslip_prompt,
                     bank_statement_prompt, p60_prompt,
                     driving_license_prompt, genric_ocr_prompt)
from utils.json_utils import restructure_documents
from utils import setup_logger

logger = setup_logger(__name__)

# File extensions that are opened directly with PIL.
_IMAGE_EXTENSIONS = ('jpg', 'jpeg', 'png', 'gif')

# Extraction prompt for each recognised ``document_type`` value; any other
# value falls back to ``genric_ocr_prompt``.
_PROMPT_BY_DOCUMENT_TYPE = {
    'passport': passport_prompt,
    'driving_license': driving_license_prompt,
    'bank_statement': bank_statement_prompt,
    'payslip': payslip_prompt,
    'p60': p60_prompt,
}


def analyze_files(file_groups: dict, temp_dir, current_upload):
    """Classify and extract data from every file in ``file_groups``.

    Each file is first sent to the LLM with ``document_type_prompt``; if the
    response carries a ``document_type``, a second call with the matching
    extraction prompt is made and its result is merged into the first.
    Failures on a single file are reported through ``st.error`` and logged,
    and processing continues with the next file.

    Args:
        file_groups: Mapping of an original file to the iterable of file
            paths extracted from it.
        temp_dir: Directory in which ``analysis_results.json`` is written.
        current_upload: Key into ``st.session_state['uploads']`` under which
            the restructured results are stored.

    Returns:
        A ``(results_group, json_output_path)`` tuple: the per-original-file
        results mapping and the path of the JSON file it was dumped to.
    """
    document_llm = DocumentLLM()
    results_group = {}

    for original_file, extracted_files in file_groups.items():
        results = {}
        for file_name in extracted_files:
            # Reset per file so an unsupported format can never see an
            # unbound name or a buffer left over from the previous file.
            image_buffer = None

            logger.info(f"file_name : {file_name}")
            extension = file_name.lower().split('.')[-1]
            results[file_name] = generate_metadata(file_name)

            try:
                logger.info(f"Starting analysis for {file_name}")
                if extension in _IMAGE_EXTENSIONS:
                    # Close the underlying file handle once it is encoded.
                    with Image.open(file_name) as image:
                        image_buffer = im_2_b64(image)
                elif extension == 'pdf':
                    img = load_pdf_as_image(file_name)
                    image_buffer = im_2_b64(img)
                    st.image(img, use_container_width=True)
                else:
                    st.write(f"Unsupported file format: {extension}")

                if image_buffer is not None:
                    # NOTE(review): this replaces the metadata stored above
                    # with the classification response -- confirm intended.
                    results[file_name] = document_llm.call_llm_api(
                        prompt=document_type_prompt, image_path=file_name)
                    logger.info(
                        f"File name: {file_name}, "
                        f"Results: {results[file_name]}")

                    document_type = results[file_name].get(
                        'document_type', None)
                    if document_type is not None:
                        prompt = _PROMPT_BY_DOCUMENT_TYPE.get(
                            document_type, genric_ocr_prompt)
                        data = document_llm.call_llm_api(
                            prompt=prompt, image_path=file_name)
                        results[file_name].update(data)
                        logger.info(f"{file_name}: {data}")
            except Exception as e:
                # Best-effort per file: surface the error and keep going.
                st.error(f"Error processing {file_name}: {str(e)}")
                logger.exception(f"Error processing {file_name}")

        results_group[original_file] = results

    results_transformed = restructure_documents(results_group)
    st.session_state['uploads'][current_upload]['results_transformed'] = (
        results_transformed)

    # Save analysis results to a JSON file
    json_output_path = os.path.join(temp_dir, "analysis_results.json")
    with open(json_output_path, "w", encoding="utf-8") as json_file:
        json.dump(results_group, json_file, indent=4)

    return results_group, json_output_path