import streamlit as st
from PIL import Image
import json
import os
from utils import (im_2_b64, load_pdf_as_image, generate_metadata,
                   setup_logger)
from utils.json_utils import restructure_documents
from .llm import DocumentLLM
from prompts import (document_type_prompt, passport_prompt,
                     payslip_prompt, bank_statement_prompt,
                     p60_prompt, driving_license_prompt,
                     genric_ocr_prompt)

logger = setup_logger(__name__)


def analyze_files(file_groups: dict, temp_dir, current_upload):
    """Classify each extracted file with the LLM, extract its fields with a
    document-type-specific prompt, and persist the results to JSON."""
    document_llm = DocumentLLM()
    results_group = {}
    for original_file, extracted_files in file_groups.items():
        results = {}
        for file_name in extracted_files:
            logger.info(f"file_name : {file_name}")
            extension = file_name.lower().split('.')[-1]

            results[file_name] = generate_metadata(file_name)

            try:
                logger.info(f"Starting analysis for {file_name}")

                # Keep the buffer defined even when the format is unsupported,
                # so the check below never hits an unbound name.
                image_buffer = None

                if extension in ['jpg', 'jpeg', 'png', 'gif']:
                    image = Image.open(file_name)
                    image_buffer = im_2_b64(image)
                elif extension == 'pdf':
                    img = load_pdf_as_image(file_name)
                    image_buffer = im_2_b64(img)
                    st.image(img, use_container_width=True)
                else:
                    st.write(f"Unsupported file format: {extension}")

                if image_buffer is not None:
                    # First pass: classify the document type.
                    results[file_name] = document_llm.call_llm_api(
                        prompt=document_type_prompt,
                        image_path=file_name)

                    logger.info(
                        f"File name: {file_name}, Results: {results[file_name]}")
                    document_type = results[file_name].get(
                        'document_type', None)

                    if document_type is not None:

                        # Second pass: extract fields with a prompt matched to
                        # the detected type, falling back to generic OCR.
                        prompt_map = {
                            'passport': passport_prompt,
                            'driving_license': driving_license_prompt,
                            'bank_statement': bank_statement_prompt,
                            'payslip': payslip_prompt,
                            'p60': p60_prompt,
                        }
                        prompt = prompt_map.get(
                            document_type, genric_ocr_prompt)

                        data = document_llm.call_llm_api(
                            prompt=prompt,
                            image_path=file_name)

                        results[file_name].update(data)

                        logger.info(f"{file_name}: {data}")

            except Exception as e:
                logger.exception(f"Error processing {file_name}")
                st.error(f"Error processing {file_name}: {str(e)}")

        results_group[original_file] = results

    results_transformed = restructure_documents(results_group)
    st.session_state['uploads'][current_upload]['results_transformed'] = results_transformed

    # Save analysis results to a JSON file
    json_output_path = os.path.join(temp_dir, "analysis_results.json")
    with open(json_output_path, "w") as json_file:
        json.dump(results_group, json_file, indent=4)

    return results_group, json_output_path
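

# Minimal usage sketch (assumptions: the caller is a Streamlit page that has
# already created st.session_state['uploads'][current_upload] and unpacked the
# upload into a temp directory; the paths and keys below are hypothetical):
#
#     file_groups = {
#         "upload.zip": ["/tmp/extract/passport.jpg",
#                        "/tmp/extract/payslip.pdf"],
#     }
#     results, json_path = analyze_files(
#         file_groups, temp_dir="/tmp/extract", current_upload="upload.zip")
#     st.json(results)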