ikraamkb committed on
Commit
4fa0b0a
·
verified ·
1 Parent(s): 7acae8b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -17
app.py CHANGED
@@ -123,6 +123,9 @@ app = FastAPI()
123
  # Mount the static directory to serve HTML, CSS, JS files
124
  app.mount("/static", StaticFiles(directory="static"), name="static")
125
 
 
 
 
126
  # Initialize transformers pipelines
127
  qa_pipeline = pipeline("question-answering", model="microsoft/phi-2", tokenizer="microsoft/phi-2")
128
  image_qa_pipeline = pipeline("vqa", model="Salesforce/blip-vqa-base")
@@ -130,13 +133,6 @@ image_qa_pipeline = pipeline("vqa", model="Salesforce/blip-vqa-base")
130
  # Initialize EasyOCR for image-based text extraction
131
  reader = easyocr.Reader(['en'])
132
 
133
- # Define a template for rendering HTML
134
- templates = Jinja2Templates(directory="templates")
135
-
136
- # Ensure temp_files directory exists
137
- temp_dir = "temp_files"
138
- os.makedirs(temp_dir, exist_ok=True)
139
-
140
  # Maximum allowed file size in bytes (e.g., 5 MB)
141
  MAX_FILE_SIZE = 5 * 1024 * 1024 # 5 MB
142
 
@@ -165,10 +161,10 @@ def extract_pptx_text(file_path: str):
165
  def extract_text_from_image(image: Image):
166
  return pytesseract.image_to_string(image)
167
 
168
- # Home route
169
  @app.get("/")
170
- def home():
171
- return RedirectResponse(url="/docs")
172
 
173
  # Function to answer questions based on document content
174
  @app.post("/question-answering-doc")
@@ -177,18 +173,29 @@ async def question_answering_doc(request: Request, question: str = Form(...), fi
177
  if file.spool_max_size > MAX_FILE_SIZE:
178
  raise HTTPException(status_code=400, detail=f"File size exceeds the {MAX_FILE_SIZE / (1024 * 1024)} MB limit.")
179
 
180
- file_path = os.path.join(temp_dir, file.filename)
181
- with open(file_path, "wb") as f:
182
- f.write(await file.read())
183
-
184
  try:
 
 
 
185
  # Extract text based on the file type
186
  if file.filename.endswith(".pdf"):
 
 
 
187
  text = extract_pdf_text(file_path)
 
188
  elif file.filename.endswith(".docx"):
 
 
 
189
  text = extract_docx_text(file_path)
 
190
  elif file.filename.endswith(".pptx"):
 
 
 
191
  text = extract_pptx_text(file_path)
 
192
  else:
193
  raise HTTPException(status_code=400, detail="Unsupported file format")
194
  except Exception as e:
@@ -196,9 +203,6 @@ async def question_answering_doc(request: Request, question: str = Form(...), fi
196
 
197
  qa_result = qa_pipeline(question=question, context=text)
198
 
199
- # Clean up the temporary file
200
- os.remove(file_path)
201
-
202
  return templates.TemplateResponse("index.html", {"request": request, "answer": qa_result['answer']})
203
 
204
  # Function to answer questions based on images
 
123
  # Mount the static directory to serve HTML, CSS, JS files
124
  app.mount("/static", StaticFiles(directory="static"), name="static")
125
 
126
+ # Define a template for rendering HTML
127
+ templates = Jinja2Templates(directory="templates")
128
+
129
  # Initialize transformers pipelines
130
  qa_pipeline = pipeline("question-answering", model="microsoft/phi-2", tokenizer="microsoft/phi-2")
131
  image_qa_pipeline = pipeline("vqa", model="Salesforce/blip-vqa-base")
 
133
  # Initialize EasyOCR for image-based text extraction
134
  reader = easyocr.Reader(['en'])
135
 
 
 
 
 
 
 
 
136
  # Maximum allowed file size in bytes (e.g., 5 MB)
137
  MAX_FILE_SIZE = 5 * 1024 * 1024 # 5 MB
138
 
 
161
  def extract_text_from_image(image: Image):
162
  return pytesseract.image_to_string(image)
163
 
164
+ # Home route - Render the index page
165
  @app.get("/")
166
+ async def home(request: Request):
167
+ return templates.TemplateResponse("index.html", {"request": request})
168
 
169
  # Function to answer questions based on document content
170
  @app.post("/question-answering-doc")
 
173
  if file.spool_max_size > MAX_FILE_SIZE:
174
  raise HTTPException(status_code=400, detail=f"File size exceeds the {MAX_FILE_SIZE / (1024 * 1024)} MB limit.")
175
 
 
 
 
 
176
  try:
177
+ # Read the file content into memory
178
+ file_content = await file.read()
179
+
180
  # Extract text based on the file type
181
  if file.filename.endswith(".pdf"):
182
+ file_path = "/tmp/tempfile.pdf"
183
+ with open(file_path, "wb") as f:
184
+ f.write(file_content)
185
  text = extract_pdf_text(file_path)
186
+ os.remove(file_path)
187
  elif file.filename.endswith(".docx"):
188
+ file_path = "/tmp/tempfile.docx"
189
+ with open(file_path, "wb") as f:
190
+ f.write(file_content)
191
  text = extract_docx_text(file_path)
192
+ os.remove(file_path)
193
  elif file.filename.endswith(".pptx"):
194
+ file_path = "/tmp/tempfile.pptx"
195
+ with open(file_path, "wb") as f:
196
+ f.write(file_content)
197
  text = extract_pptx_text(file_path)
198
+ os.remove(file_path)
199
  else:
200
  raise HTTPException(status_code=400, detail="Unsupported file format")
201
  except Exception as e:
 
203
 
204
  qa_result = qa_pipeline(question=question, context=text)
205
 
 
 
 
206
  return templates.TemplateResponse("index.html", {"request": request, "answer": qa_result['answer']})
207
 
208
  # Function to answer questions based on images