mominah committed on
Commit
f661102
·
verified ·
1 Parent(s): a5b5679

Create extraction_routes.py

Browse files
Files changed (1) hide show
  1. extraction_routes.py +78 -0
extraction_routes.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # extraction_routes.py
2
+ import os
3
+ import io
4
+ import time
5
+ import PIL.Image
6
+ from fastapi import APIRouter, File, UploadFile, HTTPException, Request
7
+ from fastapi.responses import JSONResponse
8
+ from pdf2image import convert_from_bytes
9
+ from google import genai
10
+ from google.genai.errors import ClientError
11
+
12
# Router that collects the extraction endpoints declared in this module.
router = APIRouter()

# The Gemini API key is read from the environment; importing this module
# fails fast when it is missing or empty.
API_KEY = os.environ.get("API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("API_KEY environment variable is not set")

# Single shared client used by every extraction request.
client = genai.Client(api_key=API_KEY)
19
+
20
def extract_text_from_image(img):
    """Extract the text shown in an image using the Gemini model.

    Sends ``img`` together with a fixed extraction prompt to the
    ``gemini-2.0-flash`` model through the module-level ``client``.
    Rate-limit errors (HTTP 429) are retried up to three attempts in
    total, sleeping 1 second and then 2 seconds between attempts.

    Args:
        img: Image object passed through unchanged as a content part.
            Callers in this file pass the result of ``PIL.Image.open``
            or the pages returned by ``convert_from_bytes``.

    Returns:
        The text reported by the model, or an empty string when the
        response carries no text.

    Raises:
        HTTPException: 503 when the API is still rate limited after the
            last attempt; 500 for any other ``ClientError``.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[
                    "Extract the text from the image. Do not write anything except the extracted content",
                    img,
                ],
            )
        except ClientError as e:
            # The SDK publishes the HTTP status on ``e.code``. Relying only
            # on ``e.args[0]`` being an int misses the status when the
            # exception message is a string, which disables the retry path.
            # The positional fallback is kept for backward compatibility.
            error_code = getattr(e, "code", None)
            if not isinstance(error_code, int):
                error_code = e.args[0] if e.args and isinstance(e.args[0], int) else None

            if error_code != 429:
                raise HTTPException(
                    status_code=500,
                    detail=f"Error processing image: {str(e)}"
                ) from e

            if attempt >= max_retries - 1:
                raise HTTPException(
                    status_code=503,
                    detail="API resource exhausted. Please try again later."
                ) from e

            # Exponential backoff before the next attempt: 1s, then 2s.
            time.sleep(2 ** attempt)
        else:
            # ``text`` may be None when the response has no text part;
            # normalise so callers can always treat the result as a string.
            return response.text or ""
48
+
49
@router.post("/upload", summary="Upload a PDF or image file", response_description="Returns extracted text as JSON")
async def upload_file(file: UploadFile = File(...)):
    # Extract text from an uploaded PDF or image and return it as JSON.
    #
    # A file whose name ends in ".pdf" (case-insensitive) is rasterised at
    # 200 dpi and every page is sent to extract_text_from_image; the result
    # is the concatenation of "### Page N" sections. Any other upload is
    # opened as a single image with PIL.
    #
    # Responds with {"extracted_text": <str>}.
    # Raises HTTPException 400 when no filename is supplied or the upload
    # is not a readable image, and 500 when PDF conversion fails. Errors
    # raised by extract_text_from_image propagate unchanged.
    #
    # Documented with comments rather than a docstring because FastAPI
    # publishes a handler docstring as the endpoint's OpenAPI description.

    # Function-scope import keeps this handler self-contained.
    import asyncio

    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    file_contents = await file.read()

    if file.filename.lower().endswith(".pdf"):
        try:
            # Rasterising a PDF is blocking work; run it off the event
            # loop so other requests keep being served.
            images = await asyncio.to_thread(convert_from_bytes, file_contents, dpi=200)
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Error converting PDF: {str(e)}") from e

        sections = []
        for idx, img in enumerate(images, start=1):
            # The extraction helper performs a blocking network call and
            # may sleep between retries, so it also runs in a worker thread.
            page_text = await asyncio.to_thread(extract_text_from_image, img)
            # Guard against a missing result so "None" never reaches the output.
            sections.append(f"### Page {idx}\n\n{page_text or ''}\n\n")
        output_text = "".join(sections)
    else:
        try:
            img = PIL.Image.open(io.BytesIO(file_contents))
        except Exception as e:
            raise HTTPException(status_code=400, detail="Uploaded file is not a valid image") from e

        image_text = await asyncio.to_thread(extract_text_from_image, img)
        # A missing result would otherwise fail on the string concatenation.
        output_text = (image_text or "") + "\n\n"

    return JSONResponse(content={"extracted_text": output_text})
75
+
76
@router.get("/", summary="Health Check for Extraction")
async def root():
    # Health-check endpoint: always answers with a fixed status message.
    # Documented with comments rather than a docstring so the published
    # OpenAPI description of this route stays unchanged.
    status_payload = {"message": "Text Extraction API is up and running."}
    return JSONResponse(content=status_payload)