Upload 5 files
- main.py +69 -0
- models.py +7 -0
- prompts.py +27 -0
- requirements.txt +21 -0
- utils.py +148 -0
main.py
ADDED
@@ -0,0 +1,69 @@
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from PIL import Image
import io
import json
from utils import get_text
from json_flatten import flatten

app = FastAPI(
    title="DOCUMANTICAI API",
    description="""
    This API allows you to upload an image and get a formatted response with the extracted data and image details.
    """
)

@app.post("/upload")
async def upload_image(fields: str, model: str, file: UploadFile = File(...)):
    """
    ### Endpoint Description:
    Extract form data from an uploaded image and return the extracted data in JSON format.

    #### Request Parameters:
    - `fields`: The fields to extract, or "ALL". (Required)
    - `model`: The model to use (see `/list_models`). (Required)
    - `file`: The image file to extract data from. (Required)

    #### Response:
    - A JSON object with the flattened extracted data and the image details.

    ### Notes:
    - The image should be in a supported format (e.g., PNG, JPEG).
    - The data extracted will vary depending on the image content.
    """
    try:
        # Load the uploaded image
        image = Image.open(io.BytesIO(await file.read()))

        # Collect basic image details
        image_details = {
            "filename": file.filename,
            "format": image.format,
            "size": image.size,  # (width, height)
            "mode": image.mode
        }
        response = get_text(image, image_details['filename'], model, fields)

        # Convert the JSON string returned by the model into a dictionary
        response = json.loads(response)

        # Convert the parallel fields/values lists into key-value pairs
        if 'fields' in response and 'values' in response:
            response = dict(zip(response['fields'], response['values']))

        # Flatten any nested structure into a single-level dictionary
        response = flatten(response)

        return JSONResponse(content={"response": response, "details": image_details})

    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)

@app.post("/list_models")
async def list_models():
    """
    ### Endpoint Description:
    List available models for text generation.

    #### Response:
    - A list of available models for text generation.
    """
    return JSONResponse(content={"models": ["gpt-4o-mini", "gpt-4o", "deepseek-chat", "claude-3-5-sonnet-20241022", "llama_llm_d", "llama_llm_o"]})
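For reference, a minimal client sketch for the `/upload` endpoint (not part of the commit). It assumes the API is served locally on port 8000 and that "form.png" is a sample image on disk; httpx is already pinned in requirements.txt. Note that `fields` and `model` are query parameters, while the image travels as a multipart file.

import httpx

with open("form.png", "rb") as f:  # "form.png" is a hypothetical sample image
    r = httpx.post(
        "http://localhost:8000/upload",
        params={"fields": "ALL", "model": "gpt-4o-mini"},  # query parameters
        files={"file": ("form.png", f, "image/png")},      # multipart upload
    )
print(r.json())  # {"response": {...flattened key-value pairs...}, "details": {...}}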
models.py
ADDED
@@ -0,0 +1,7 @@
from pydantic import BaseModel, Field
from typing import Optional, List


class FormDetails(BaseModel):
    fields: List[str]
    values: List[str]
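A quick sketch of how these parallel lists are consumed downstream: main.py zips `fields` and `values` into a dictionary, so the two lists are expected to have matching lengths. The sample data here is hypothetical.

from models import FormDetails

details = FormDetails(fields=["Name", "Date"], values=["Jane Doe", "2024-01-01"])
record = dict(zip(details.fields, details.values))
print(record)  # {'Name': 'Jane Doe', 'Date': '2024-01-01'}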
prompts.py
ADDED
@@ -0,0 +1,27 @@
# Define the system_prompt template
system_prompt_template = """
You are an OCR-like data extraction tool that extracts information from the image provided of a form for {}.
1. Please extract the data in this image, and then output into JSON.
2. Please keep the keys and values of the JSON in the original language.
3. The type of data you might encounter in the image includes but is not limited to: names, dates, checkboxes, etc.
4. If there are tables in the image, capture all of the rows and columns in the JSON object.
Even if a column is blank, include it as a key in the JSON object with a null value.
5. Don't interpolate or make up data.
6. Please maintain the table structure of the charges, i.e., capture all of the rows and columns in the JSON object.
7. Return null if the data is not available.
8. If no checkboxes are selected, just return null.
9. Triple check any numbers provided in the attached image.
10. Properly check which row the data belongs to.

EXAMPLE JSON OUTPUT:
{}
"""

prompt = """Please extract the [{}] details from this image, and then output into JSON."""

# # Fill in the mission dynamically using the format function
# mission = "processing medical records"  # Replace with your specific mission
# system_prompt = system_prompt_template.format(mission)

# # Print or use the resulting system_prompt
# print(system_prompt)
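As utils.py shows, `system_prompt_template` has two placeholders: the form's context (the filename, in this repo) and an example JSON output (`FormDetails.schema_json()`, in this repo). A small sketch with a hypothetical stand-in schema:

example_schema = '{"fields": ["..."], "values": ["..."]}'  # hypothetical stand-in for FormDetails.schema_json()
system_prompt = system_prompt_template.format("invoice.png", example_schema)
user_prompt = prompt.format("Name, Date, Total")
print(user_prompt)  # Please extract the [Name, Date, Total] details from this image, and then output into JSON.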
requirements.txt
ADDED
@@ -0,0 +1,21 @@
anthropic==0.44.0
fastapi==0.115.6
httpcore==1.0.7
httpx==0.28.1
ipykernel==6.29.5
ipython==8.31.0
jupyter_client==8.6.3
jupyter_core==5.7.2
matplotlib-inline==0.1.7
nest-asyncio==1.6.0
openai==1.59.9
pillow==11.1.0
pydantic==2.10.5
python-dotenv==1.0.1
python-multipart==0.0.20
tqdm==4.67.1
uvicorn==0.34.0
llama-index-core==0.12.14
llama-index-readers-file==0.4.4
llama-parse==0.5.20
json-flatten==0.3.0
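After `pip install -r requirements.txt`, a minimal launcher sketch using the pinned uvicorn; the port is an assumption (7860 is the Hugging Face Spaces default), and the Space may well start the server differently.

# run.py -- hypothetical launcher script
import uvicorn

if __name__ == "__main__":
    # Port 7860 is assumed (Hugging Face Spaces default); any free port works.
    uvicorn.run("main:app", host="0.0.0.0", port=7860)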
utils.py
ADDED
@@ -0,0 +1,148 @@
from dotenv import load_dotenv
import os
# import openai
from openai import OpenAI
from models import FormDetails
from prompts import system_prompt_template, prompt
import base64
from io import BytesIO
import anthropic
import nest_asyncio
from llama_parse import LlamaParse

nest_asyncio.apply()
load_dotenv()

# Set up the LlamaParse parser
parser = LlamaParse(
    result_type="markdown"  # "markdown" and "text" are available
)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


# Encode a PIL image as a base64 string
def encode_image(image):
    buffer = BytesIO()
    # Save the image to the buffer in its original format (e.g., PNG, JPEG)
    image.save(buffer, format=image.format)
    # Get the byte data
    image_bytes = buffer.getvalue()
    return base64.b64encode(image_bytes).decode("utf-8")


def get_text(image, filename, model, fields="ALL"):
    # Get the base64 string for the image
    base64_image = encode_image(image)

    # OpenAI models: structured output parsed against the FormDetails schema
    if model.startswith("gpt"):
        print("gpt")
        client = OpenAI(api_key=OPENAI_API_KEY)
        response = client.beta.chat.completions.parse(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": system_prompt_template.format(filename, FormDetails.schema_json())
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt.format(fields),
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
            response_format=FormDetails,
            temperature=0.0,
        )
        response = response.choices[0].message.content

    # Anthropic models: the expected JSON shape is described in the system prompt
    elif model.startswith("claude"):
        print("claude")
        client = anthropic.Anthropic()
        message = client.messages.create(
            model=model,
            max_tokens=1024,
            system=system_prompt_template.format(filename, FormDetails.schema_json()) + " In following Json format,class FormDetails(BaseModel):\nfields: List[str]\nvalues: List[str] ",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": base64_image,
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt.format(fields),
                        }
                    ],
                }
            ],
            temperature=0.0,
        )
        response = message.content[0].text

    # llama_llm models: parse the image with LlamaParse first, then pass the
    # parsed text to a text-only model
    elif model.startswith("llama_llm"):
        print("llama_llm")
        # Ensure the image is in RGB mode (to handle RGBA images)
        if image.mode == "RGBA":
            image = image.convert("RGB")
        # Save the image to a file
        image.save("image.jpg")
        # Parse the image
        text = parser.load_data("image.jpg")
        if model == "llama_llm_o":
            client = OpenAI(api_key=OPENAI_API_KEY)
            response = client.beta.chat.completions.parse(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt_template.format(filename, FormDetails.schema_json())
                    },
                    {
                        "role": "user",
                        "content": f"{prompt.format(fields)} \n Knowledge Base {text}"
                    }
                ],
                response_format=FormDetails,
                temperature=0.0,
            )
            response = response.choices[0].message.content
        elif model == "llama_llm_d":
            # DeepSeek via its OpenAI-compatible API
            print("deepseek")
            client = OpenAI(api_key=os.getenv('DEEPSEEK_API_KEY'), base_url=os.getenv('DEEPSEEK_API_URL'))
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt_template.format(filename, FormDetails.schema_json())
                    },
                    {
                        "role": "user",
                        "content": f"{prompt.format(fields)} \n Knowledge Base {text}"
                    }
                ],
                stream=False,
                response_format={
                    'type': 'json_object'
                }
            )
            response = response.choices[0].message.content
    return response
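A sketch of calling get_text directly, outside the API: it assumes OPENAI_API_KEY is set in .env, and "form.png" is a hypothetical local image. The return value is a JSON string that main.py then loads, zips, and flattens.

from PIL import Image
from utils import get_text

img = Image.open("form.png")  # hypothetical sample image
raw = get_text(img, "form.png", model="gpt-4o-mini", fields="Name, Date")
print(raw)  # JSON string matching the FormDetails schema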