Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -123,6 +123,9 @@ app = FastAPI()
|
|
123 |
# Mount the static directory to serve HTML, CSS, JS files
|
124 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
125 |
|
|
|
|
|
|
|
126 |
# Initialize transformers pipelines
|
127 |
qa_pipeline = pipeline("question-answering", model="microsoft/phi-2", tokenizer="microsoft/phi-2")
|
128 |
image_qa_pipeline = pipeline("vqa", model="Salesforce/blip-vqa-base")
|
@@ -130,13 +133,6 @@ image_qa_pipeline = pipeline("vqa", model="Salesforce/blip-vqa-base")
|
|
130 |
# Initialize EasyOCR for image-based text extraction
|
131 |
reader = easyocr.Reader(['en'])
|
132 |
|
133 |
-
# Define a template for rendering HTML
|
134 |
-
templates = Jinja2Templates(directory="templates")
|
135 |
-
|
136 |
-
# Ensure temp_files directory exists
|
137 |
-
temp_dir = "temp_files"
|
138 |
-
os.makedirs(temp_dir, exist_ok=True)
|
139 |
-
|
140 |
# Maximum allowed file size in bytes (5 MB)
|
141 |
MAX_FILE_SIZE = 5 * 1024 * 1024 # 5 MB
|
142 |
|
@@ -165,10 +161,10 @@ def extract_pptx_text(file_path: str):
|
|
165 |
def extract_text_from_image(image: Image):
|
166 |
return pytesseract.image_to_string(image)
|
167 |
|
168 |
-
# Home route
|
169 |
@app.get("/")
|
170 |
-
def home():
|
171 |
-
return
|
172 |
|
173 |
# POST endpoint: answer a question using text extracted from the submitted document
|
174 |
@app.post("/question-answering-doc")
|
@@ -177,18 +173,29 @@ async def question_answering_doc(request: Request, question: str = Form(...), fi
|
|
177 |
if file.spool_max_size > MAX_FILE_SIZE:
|
178 |
raise HTTPException(status_code=400, detail=f"File size exceeds the {MAX_FILE_SIZE / (1024 * 1024)} MB limit.")
|
179 |
|
180 |
-
file_path = os.path.join(temp_dir, file.filename)
|
181 |
-
with open(file_path, "wb") as f:
|
182 |
-
f.write(await file.read())
|
183 |
-
|
184 |
try:
|
|
|
|
|
|
|
185 |
# Extract text based on the file name's extension (.pdf, .docx, .pptx)
|
186 |
if file.filename.endswith(".pdf"):
|
|
|
|
|
|
|
187 |
text = extract_pdf_text(file_path)
|
|
|
188 |
elif file.filename.endswith(".docx"):
|
|
|
|
|
|
|
189 |
text = extract_docx_text(file_path)
|
|
|
190 |
elif file.filename.endswith(".pptx"):
|
|
|
|
|
|
|
191 |
text = extract_pptx_text(file_path)
|
|
|
192 |
else:
|
193 |
raise HTTPException(status_code=400, detail="Unsupported file format")
|
194 |
except Exception as e:
|
@@ -196,9 +203,6 @@ async def question_answering_doc(request: Request, question: str = Form(...), fi
|
|
196 |
|
197 |
qa_result = qa_pipeline(question=question, context=text)
|
198 |
|
199 |
-
# Clean up the temporary file
|
200 |
-
os.remove(file_path)
|
201 |
-
|
202 |
return templates.TemplateResponse("index.html", {"request": request, "answer": qa_result['answer']})
|
203 |
|
204 |
# Function to answer questions based on images
|
|
|
123 |
# Mount the static directory to serve HTML, CSS, JS files
|
124 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
125 |
|
126 |
+
# Define a template for rendering HTML
|
127 |
+
templates = Jinja2Templates(directory="templates")
|
128 |
+
|
129 |
# Initialize transformers pipelines
|
130 |
qa_pipeline = pipeline("question-answering", model="microsoft/phi-2", tokenizer="microsoft/phi-2")
|
131 |
image_qa_pipeline = pipeline("vqa", model="Salesforce/blip-vqa-base")
|
|
|
133 |
# Initialize EasyOCR for image-based text extraction
|
134 |
reader = easyocr.Reader(['en'])
|
135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
# Maximum allowed file size in bytes (5 MB)
|
137 |
MAX_FILE_SIZE = 5 * 1024 * 1024 # 5 MB
|
138 |
|
|
|
161 |
def extract_text_from_image(image: Image):
|
162 |
return pytesseract.image_to_string(image)
|
163 |
|
164 |
+
# Home route - Render the index page
|
165 |
@app.get("/")
|
166 |
+
async def home(request: Request):
|
167 |
+
return templates.TemplateResponse("index.html", {"request": request})
|
168 |
|
169 |
# POST endpoint: answer a question using text extracted from the submitted document
|
170 |
@app.post("/question-answering-doc")
|
|
|
173 |
if file.spool_max_size > MAX_FILE_SIZE:
|
174 |
raise HTTPException(status_code=400, detail=f"File size exceeds the {MAX_FILE_SIZE / (1024 * 1024)} MB limit.")
|
175 |
|
|
|
|
|
|
|
|
|
176 |
try:
|
177 |
+
# Read the file content into memory
|
178 |
+
file_content = await file.read()
|
179 |
+
|
180 |
# Extract text based on the file name's extension (.pdf, .docx, .pptx)
|
181 |
if file.filename.endswith(".pdf"):
|
182 |
+
file_path = "/tmp/tempfile.pdf"
|
183 |
+
with open(file_path, "wb") as f:
|
184 |
+
f.write(file_content)
|
185 |
text = extract_pdf_text(file_path)
|
186 |
+
os.remove(file_path)
|
187 |
elif file.filename.endswith(".docx"):
|
188 |
+
file_path = "/tmp/tempfile.docx"
|
189 |
+
with open(file_path, "wb") as f:
|
190 |
+
f.write(file_content)
|
191 |
text = extract_docx_text(file_path)
|
192 |
+
os.remove(file_path)
|
193 |
elif file.filename.endswith(".pptx"):
|
194 |
+
file_path = "/tmp/tempfile.pptx"
|
195 |
+
with open(file_path, "wb") as f:
|
196 |
+
f.write(file_content)
|
197 |
text = extract_pptx_text(file_path)
|
198 |
+
os.remove(file_path)
|
199 |
else:
|
200 |
raise HTTPException(status_code=400, detail="Unsupported file format")
|
201 |
except Exception as e:
|
|
|
203 |
|
204 |
qa_result = qa_pipeline(question=question, context=text)
|
205 |
|
|
|
|
|
|
|
206 |
return templates.TemplateResponse("index.html", {"request": request, "answer": qa_result['answer']})
|
207 |
|
208 |
# Function to answer questions based on images
|