rockerritesh committed · verified
Commit fc9f4fe · Parent: bb3f8b3

Upload 5 files

Files changed (5):
  1. main.py +69 -0
  2. models.py +7 -0
  3. prompts.py +27 -0
  4. requirements.txt +21 -0
  5. utils.py +148 -0
main.py ADDED
@@ -0,0 +1,69 @@
+ from fastapi import FastAPI, File, UploadFile
+ from fastapi.responses import JSONResponse
+ from PIL import Image
+ import io
+ import json
+ from utils import get_text
+ from json_flatten import flatten
+
+ app = FastAPI(
+     title="DOCUMANTICAI API",
+     description="""
+     This API lets you upload an image and get back the extracted form data together with basic image metadata.
+     """,
+ )
+
+ @app.post("/upload")
+ async def upload_image(fields: str, model: str, file: UploadFile = File(...)):
+     """
+     ### Endpoint Description:
+     Extract form data from an uploaded image and return the extracted data in JSON format.
+
+     #### Request Parameters:
+     - `fields`: The fields to extract, or `ALL`. (Required)
+     - `model`: The model to use; see `/list_models` for the available names. (Required)
+     - `file`: The image file to extract data from. (Required)
+
+     #### Response:
+     - A JSON object with the extracted key-value pairs (`response`) and image metadata (`details`).
+
+     ### Notes:
+     - The image should be in a supported format (e.g., PNG, JPEG).
+     - The data extracted will vary depending on the image content.
+     """
+     try:
+         # Load the uploaded image
+         image = Image.open(io.BytesIO(await file.read()))
+
+         # Collect basic image metadata
+         image_details = {
+             "filename": file.filename,
+             "format": image.format,
+             "size": image.size,  # (width, height)
+             "mode": image.mode,
+         }
+
+         # Ask the selected model to extract the requested fields
+         response = get_text(image, image_details["filename"], model, fields)
+
+         # Parse the model's JSON string into a dictionary
+         response = json.loads(response)
+
+         # Convert the parallel fields/values lists into key-value pairs
+         if "fields" in response and "values" in response:
+             response = dict(zip(response["fields"], response["values"]))
+
+         # Flatten any nested structure into a single level
+         response = flatten(response)
+
+         return JSONResponse(content={"response": response, "details": image_details})
+
+     except Exception as e:
+         return JSONResponse(content={"error": str(e)}, status_code=400)
+
+ @app.post("/list_models")
+ async def list_models():
+     """
+     ### Endpoint Description:
+     List the available models for text extraction.
+
+     #### Response:
+     - A list of available model names.
+     """
+     return JSONResponse(content={"models": ["gpt-4o-mini", "gpt-4o", "deepseek-chat", "claude-3-5-sonnet-20241022", "llama_llm_d", "llama_llm_o"]})
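
For reference, a minimal client sketch (not part of this commit) for exercising the /upload endpoint; the host, port, and image path below are assumptions:

    import requests

    # Assumes the API is running locally, e.g. via `uvicorn main:app --port 8000`
    url = "http://localhost:8000/upload"
    params = {"fields": "ALL", "model": "gpt-4o-mini"}

    # "form.png" is a hypothetical sample image
    with open("form.png", "rb") as f:
        files = {"file": ("form.png", f, "image/png")}
        r = requests.post(url, params=params, files=files)

    print(r.json())  # {"response": {...flattened key-value pairs...}, "details": {...}}

Note that `fields` and `model` are plain function parameters, so FastAPI expects them as query parameters rather than form fields.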
models.py ADDED
@@ -0,0 +1,7 @@
+ from pydantic import BaseModel
+ from typing import List
+
+
+ class FormDetails(BaseModel):
+     fields: List[str]
+     values: List[str]
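
As a quick illustration (not in the commit), the two parallel lists in FormDetails are what main.py zips into a flat key-value mapping:

    from models import FormDetails

    d = FormDetails(fields=["name", "date"], values=["Jane Doe", "2024-01-15"])
    # main.py pairs the lists up with dict(zip(...)):
    print(dict(zip(d.fields, d.values)))  # {'name': 'Jane Doe', 'date': '2024-01-15'}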
prompts.py ADDED
@@ -0,0 +1,27 @@
+ # Define the system prompt template (two placeholders: mission and example JSON output)
+ system_prompt_template = """
+ You are an OCR-like data extraction tool that extracts information from the provided image of a form for {}.
+ 1. Please extract the data in this image, and then output it as JSON.
+ 2. Please keep the keys and values of the JSON in the original language.
+ 3. The type of data you might encounter in the image includes but is not limited to: names, dates, checkboxes, etc.
+ 4. If there are tables in the image, capture all of the rows and columns in the JSON object.
+    Even if a column is blank, include it as a key in the JSON object with a null value.
+ 5. Don't interpolate or make up data.
+ 6. Please maintain the table structure of the charges, i.e., capture all of the rows and columns in the JSON object.
+ 7. Return null if the data is not available.
+ 8. If no checkboxes are selected, just return null.
+ 9. Triple-check any numbers provided in the attached image.
+ 10. Properly check which row each piece of data belongs to.
+
+ EXAMPLE JSON OUTPUT:
+ {}
+ """
+
+ prompt = """Please extract the [{}] details from this image, and then output them as JSON."""
+
+ # # Fill in both placeholders dynamically using format()
+ # mission = "processing medical records"  # Replace with your specific mission
+ # example_output = '{"fields": ["name"], "values": ["Jane Doe"]}'
+ # system_prompt = system_prompt_template.format(mission, example_output)
+
+ # # Print or use the resulting system_prompt
+ # print(system_prompt)
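
A short sketch (not part of the commit) of how utils.py fills the two placeholders, the first with the filename and the second with the FormDetails schema:

    import json
    from models import FormDetails
    from prompts import system_prompt_template, prompt

    # First slot: the form's name/mission; second slot: the expected JSON schema
    system_prompt = system_prompt_template.format(
        "invoice.png", json.dumps(FormDetails.model_json_schema())
    )
    user_prompt = prompt.format("name, date, total")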
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ anthropic==0.44.0
+ fastapi==0.115.6
+ httpcore==1.0.7
+ httpx==0.28.1
+ ipykernel==6.29.5
+ ipython==8.31.0
+ jupyter_client==8.6.3
+ jupyter_core==5.7.2
+ matplotlib-inline==0.1.7
+ nest-asyncio==1.6.0
+ openai==1.59.9
+ pillow==11.1.0
+ pydantic==2.10.5
+ python-dotenv==1.0.1
+ python-multipart==0.0.20
+ tqdm==4.67.1
+ uvicorn==0.34.0
+ llama-index-core==0.12.14
+ llama-index-readers-file==0.4.4
+ llama-parse==0.5.20
+ json-flatten==0.3.0
utils.py ADDED
@@ -0,0 +1,148 @@
+ from dotenv import load_dotenv
+ import os
+ import json
+ import base64
+ from io import BytesIO
+
+ from openai import OpenAI
+ import anthropic
+ import nest_asyncio
+ from llama_parse import LlamaParse
+
+ from models import FormDetails
+ from prompts import system_prompt_template, prompt
+
+ nest_asyncio.apply()
+ load_dotenv()
+
+ # Set up the parser (reads LLAMA_CLOUD_API_KEY from the environment)
+ parser = LlamaParse(
+     result_type="markdown"  # "markdown" and "text" are available
+ )
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+ # Pydantic v2: serialize the schema once for use in the system prompt
+ FORM_SCHEMA = json.dumps(FormDetails.model_json_schema())
+
+
+ def encode_image(image):
+     """Encode a PIL image as a base64 string, preserving its original format."""
+     buffer = BytesIO()
+     # Save the image to the buffer in its format; fall back to PNG if unknown
+     image.save(buffer, format=image.format or "PNG")
+     image_bytes = buffer.getvalue()
+     return base64.b64encode(image_bytes).decode("utf-8")
+
+
+ def get_text(image, filename, model, fields="ALL"):
+     # Getting the base64 string and the matching MIME type
+     base64_image = encode_image(image)
+     media_type = f"image/{(image.format or 'png').lower()}"
+
+     # Model names starting with "gpt" go through the OpenAI structured-output API
+     if model.startswith("gpt"):
+         client = OpenAI(api_key=OPENAI_API_KEY)
+         response = client.beta.chat.completions.parse(
+             model=model,
+             messages=[
+                 {
+                     "role": "system",
+                     "content": system_prompt_template.format(filename, FORM_SCHEMA),
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": prompt.format(fields)},
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:{media_type};base64,{base64_image}"},
+                         },
+                     ],
+                 },
+             ],
+             response_format=FormDetails,
+             temperature=0.0,
+         )
+         response = response.choices[0].message.content
+
+     # Model names starting with "claude" go through the Anthropic messages API
+     elif model.startswith("claude"):
+         # Reads ANTHROPIC_API_KEY from the environment
+         client = anthropic.Anthropic()
+         message = client.messages.create(
+             model=model,
+             max_tokens=1024,
+             system=system_prompt_template.format(filename, FORM_SCHEMA)
+             + ' Respond in the following JSON format: {"fields": [...], "values": [...]}',
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image",
+                             "source": {
+                                 "type": "base64",
+                                 "media_type": media_type,
+                                 "data": base64_image,
+                             },
+                         },
+                         {"type": "text", "text": prompt.format(fields)},
+                     ],
+                 },
+             ],
+             temperature=0.0,
+         )
+         response = message.content[0].text
+
+     # "llama_llm_*" models first OCR the image with LlamaParse, then pass the text to an LLM
+     elif model.startswith("llama_llm"):
+         # JPEG has no alpha channel, so normalize to RGB before saving
+         if image.mode != "RGB":
+             image = image.convert("RGB")
+         # Save the image to a file and parse it into markdown text
+         image.save("image.jpg")
+         text = parser.load_data("image.jpg")
+         if model == "llama_llm_o":
+             client = OpenAI(api_key=OPENAI_API_KEY)
+             response = client.beta.chat.completions.parse(
+                 model="gpt-4o-mini",
+                 messages=[
+                     {
+                         "role": "system",
+                         "content": system_prompt_template.format(filename, FORM_SCHEMA),
+                     },
+                     {
+                         "role": "user",
+                         "content": f"{prompt.format(fields)} \n Knowledge Base {text}",
+                     },
+                 ],
+                 response_format=FormDetails,
+                 temperature=0.0,
+             )
+             response = response.choices[0].message.content
+         elif model == "llama_llm_d":
+             # DeepSeek exposes an OpenAI-compatible endpoint
+             client = OpenAI(
+                 api_key=os.getenv("DEEPSEEK_API_KEY"),
+                 base_url=os.getenv("DEEPSEEK_API_URL"),
+             )
+             response = client.chat.completions.create(
+                 model="deepseek-chat",
+                 messages=[
+                     {
+                         "role": "system",
+                         "content": system_prompt_template.format(filename, FORM_SCHEMA),
+                     },
+                     {
+                         "role": "user",
+                         "content": f"{prompt.format(fields)} \n Knowledge Base {text}",
+                     },
+                 ],
+                 stream=False,
+                 response_format={"type": "json_object"},
+             )
+             response = response.choices[0].message.content
+         else:
+             raise ValueError(f"Unknown llama_llm model: {model}")
+     else:
+         # Avoid returning an unbound variable for unrecognized model names
+         raise ValueError(f"Unsupported model: {model}")
+
+     return response
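
For a quick local smoke test (not part of this commit; assumes the relevant API keys are set in `.env` and a sample image exists on disk):

    from PIL import Image
    from utils import get_text

    # "sample_form.png" is a hypothetical test image
    image = Image.open("sample_form.png")
    raw = get_text(image, "sample_form.png", "gpt-4o-mini", fields="name, date")
    print(raw)  # a JSON string matching the FormDetails schema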