ikraamkb committed on
Commit
4f031a5
·
verified ·
1 Parent(s): 7abb419

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -44
app.py CHANGED
@@ -1,10 +1,11 @@
1
- from fastapi import FastAPI, UploadFile, Form
2
- from fastapi.responses import JSONResponse, FileResponse
3
- import fitz # PyMuPDF for PDFs
4
- import easyocr # OCR for images
5
- import openpyxl # XLSX processing
6
- import pptx # PPTX processing
7
- import docx # DOCX processing
 
8
  from transformers import pipeline
9
  from gtts import gTTS
10
  import tempfile
@@ -12,56 +13,40 @@ import os
12
 
13
  app = FastAPI()
14
 
15
- # Load AI models
16
  qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
17
  reader = easyocr.Reader(['en', 'fr'])
18
 
19
- # Text Extraction
20
  def extract_text_from_pdf(pdf_file):
21
- text = []
22
  try:
23
  with fitz.open(pdf_file) as doc:
24
- for page in doc:
25
- text.append(page.get_text("text"))
26
  except Exception as e:
27
  return f"Error reading PDF: {e}"
28
- return "\n".join(text)
29
 
30
  def extract_text_from_docx(docx_file):
31
  doc = docx.Document(docx_file)
32
  return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
33
 
34
  def extract_text_from_pptx(pptx_file):
35
- text = []
36
  try:
37
  prs = pptx.Presentation(pptx_file)
38
- for slide in prs.slides:
39
- for shape in slide.shapes:
40
- if hasattr(shape, "text"):
41
- text.append(shape.text)
42
  except Exception as e:
43
  return f"Error reading PPTX: {e}"
44
- return "\n".join(text)
45
 
46
  def extract_text_from_xlsx(xlsx_file):
47
- text = []
48
  try:
49
  wb = openpyxl.load_workbook(xlsx_file)
50
- for sheet in wb.sheetnames:
51
- ws = wb[sheet]
52
- for row in ws.iter_rows(values_only=True):
53
- text.append(" ".join(str(cell) for cell in row if cell))
54
  except Exception as e:
55
  return f"Error reading XLSX: {e}"
56
- return "\n".join(text)
57
 
58
- # Main QA logic
59
  def answer_question_from_doc(file, question):
60
  ext = file.filename.split(".")[-1].lower()
61
  file_path = f"/tmp/{file.filename}"
62
 
63
  with open(file_path, "wb") as f:
64
- f.write(file.file.read())
65
 
66
  if ext == "pdf":
67
  context = extract_text_from_pdf(file_path)
@@ -83,22 +68,6 @@ def answer_question_from_doc(file, question):
83
  tts = gTTS(answer)
84
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
85
  tts.save(tmp.name)
86
- audio_path = tmp.name
87
- return answer, audio_path
88
  except Exception as e:
89
  return f"Error generating answer: {e}", None
90
-
91
- # API route for prediction
92
@app.post("/predict")
async def predict(file: UploadFile, question: str = Form(...)):
    """Answer a question about an uploaded document.

    Delegates to ``answer_question_from_doc`` and returns a JSON body with
    the ``answer``. When an audio file was produced, the body also carries
    an ``audio`` entry holding the ``/audio/<basename>`` path to fetch it.
    """
    answer, audio_path = answer_question_from_doc(file, question)
    payload = {"answer": answer}
    if audio_path:
        # Expose only the file name; the audio route resolves the directory.
        payload["audio"] = f"/audio/{os.path.basename(audio_path)}"
    return JSONResponse(content=payload)
99
-
100
- # Route to serve audio
101
@app.get("/audio/{filename}")
async def get_audio(filename: str):
    """Serve a generated MP3 file from the system temp directory.

    Only plain ``*.mp3`` file names that exist in the temp directory are
    served. Anything else (path components, other extensions, missing
    files) yields a 404 rather than exposing unrelated temp files or
    failing with a server error.

    Raises:
        HTTPException: 404 when the name is not a bare ``.mp3`` file name
            or the file does not exist.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    safe_name = os.path.basename(filename)
    file_path = os.path.join(tempfile.gettempdir(), safe_name)
    # NOTE(review): uploads are written under /tmp elsewhere in this file,
    # which is presumably the same directory on Linux, so restrict what can
    # be fetched to the audio files this service generates.
    if (
        safe_name != filename
        or not safe_name.endswith(".mp3")
        or not os.path.isfile(file_path)
    ):
        raise HTTPException(status_code=404, detail="Audio file not found")
    return FileResponse(path=file_path, media_type="audio/mpeg")
 
1
+ ### app.py Document QA Backend (Cleaned)
2
+ from fastapi import FastAPI
3
+ from fastapi.responses import FileResponse, JSONResponse
4
+ import fitz # PyMuPDF
5
+ import easyocr
6
+ import openpyxl
7
+ import pptx
8
+ import docx
9
  from transformers import pipeline
10
  from gtts import gTTS
11
  import tempfile
 
13
 
14
  app = FastAPI()
15
 
 
16
  qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
17
  reader = easyocr.Reader(['en', 'fr'])
18
 
 
19
  def extract_text_from_pdf(pdf_file):
 
20
  try:
21
  with fitz.open(pdf_file) as doc:
22
+ return "\n".join(page.get_text("text") for page in doc)
 
23
  except Exception as e:
24
  return f"Error reading PDF: {e}"
 
25
 
26
  def extract_text_from_docx(docx_file):
27
  doc = docx.Document(docx_file)
28
  return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
29
 
30
  def extract_text_from_pptx(pptx_file):
 
31
  try:
32
  prs = pptx.Presentation(pptx_file)
33
+ return "\n".join(shape.text for slide in prs.slides for shape in slide.shapes if hasattr(shape, "text"))
 
 
 
34
  except Exception as e:
35
  return f"Error reading PPTX: {e}"
 
36
 
37
  def extract_text_from_xlsx(xlsx_file):
 
38
  try:
39
  wb = openpyxl.load_workbook(xlsx_file)
40
+ return "\n".join(" ".join(str(cell) for cell in row if cell) for sheet in wb.sheetnames for row in wb[sheet].iter_rows(values_only=True))
 
 
 
41
  except Exception as e:
42
  return f"Error reading XLSX: {e}"
 
43
 
 
44
  def answer_question_from_doc(file, question):
45
  ext = file.filename.split(".")[-1].lower()
46
  file_path = f"/tmp/{file.filename}"
47
 
48
  with open(file_path, "wb") as f:
49
+ f.write(file.read())
50
 
51
  if ext == "pdf":
52
  context = extract_text_from_pdf(file_path)
 
68
  tts = gTTS(answer)
69
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
70
  tts.save(tmp.name)
71
+ return answer, tmp.name
 
72
  except Exception as e:
73
  return f"Error generating answer: {e}", None