ikraamkb committed on
Commit
811b0b3
·
verified ·
1 Parent(s): f23d324

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -45
app.py CHANGED
@@ -1,27 +1,17 @@
1
-
2
- import gradio as gr
3
  import fitz # PyMuPDF for PDFs
4
  import easyocr # OCR for images
5
  import openpyxl # XLSX processing
6
  import pptx # PPTX processing
7
  import docx # DOCX processing
8
- import json # Exporting results
9
- from deep_translator import GoogleTranslator
10
  from transformers import pipeline
11
- from fastapi import FastAPI
12
- from starlette.responses import RedirectResponse
13
-
14
- # Initialize FastAPI app
15
- app = FastAPI()
16
 
17
  # Initialize AI Models
18
  qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
19
- image_captioning = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
20
  reader = easyocr.Reader(['en', 'fr']) # OCR for English & French
21
 
22
  # ---- TEXT EXTRACTION FUNCTIONS ----
23
  def extract_text_from_pdf(pdf_file):
24
- """Extract text from a PDF file."""
25
  text = []
26
  try:
27
  with fitz.open(pdf_file) as doc:
@@ -32,12 +22,10 @@ def extract_text_from_pdf(pdf_file):
32
  return "\n".join(text)
33
 
34
  def extract_text_from_docx(docx_file):
35
- """Extract text from a DOCX file."""
36
  doc = docx.Document(docx_file)
37
  return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
38
 
39
  def extract_text_from_pptx(pptx_file):
40
- """Extract text from a PPTX file."""
41
  text = []
42
  try:
43
  presentation = pptx.Presentation(pptx_file)
@@ -50,7 +38,6 @@ def extract_text_from_pptx(pptx_file):
50
  return "\n".join(text)
51
 
52
  def extract_text_from_xlsx(xlsx_file):
53
- """Extract text from an XLSX file."""
54
  text = []
55
  try:
56
  wb = openpyxl.load_workbook(xlsx_file)
@@ -62,12 +49,10 @@ def extract_text_from_xlsx(xlsx_file):
62
  return f"Error reading XLSX: {e}"
63
  return "\n".join(text)
64
 
65
-
66
- # ---- MAIN PROCESSING FUNCTIONS ----
67
  def answer_question_from_doc(file, question):
68
- """Process document and answer a question based on its content."""
69
  ext = file.name.split(".")[-1].lower()
70
-
71
  if ext == "pdf":
72
  context = extract_text_from_pdf(file.name)
73
  elif ext == "docx":
@@ -77,38 +62,13 @@ def answer_question_from_doc(file, question):
77
  elif ext == "xlsx":
78
  context = extract_text_from_xlsx(file.name)
79
  else:
80
- return """Unsupported file format."""
81
 
82
  if not context.strip():
83
- return """No text found in the document."""
84
 
85
- # Generate answer using QA pipeline correctly
86
  try:
87
  result = qa_model({"question": question, "context": context})
88
  return result["answer"]
89
  except Exception as e:
90
  return f"Error generating answer: {e}"
91
-
92
- try:
93
- result = qa_model({"question": question, "context": img_text})
94
- return result["answer"]
95
- except Exception as e:
96
- return f"Error generating answer: {e}"
97
-
98
-
99
- with gr.Blocks() as img_interface:
100
- gr.Markdown("## 🖼️ Image Question Answering")
101
- image_input = gr.Image(label="Upload an Image")
102
- img_question_input = gr.Textbox(label="Ask a question")
103
- img_answer_output = gr.Textbox(label="Answer")
104
- image_submit = gr.Button("Get Answer")
105
- image_submit.click(answer_question_from_image, inputs=[image_input, img_question_input], outputs=img_answer_output)
106
-
107
- # ---- MOUNT GRADIO APP ----
108
- demo = gr.TabbedInterface(img_interface, "Image QA")
109
- app = gr.mount_gradio_app(app, demo, path="/")
110
-
111
- @app.get("/")
112
- def home():
113
- return RedirectResponse(url="/")
114
-
 
1
+ # app.py
 
2
  import fitz # PyMuPDF for PDFs
3
  import easyocr # OCR for images
4
  import openpyxl # XLSX processing
5
  import pptx # PPTX processing
6
  import docx # DOCX processing
 
 
7
  from transformers import pipeline
 
 
 
 
 
8
 
9
  # Initialize AI Models
10
  qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
 
11
  reader = easyocr.Reader(['en', 'fr']) # OCR for English & French
12
 
13
  # ---- TEXT EXTRACTION FUNCTIONS ----
14
  def extract_text_from_pdf(pdf_file):
 
15
  text = []
16
  try:
17
  with fitz.open(pdf_file) as doc:
 
22
  return "\n".join(text)
23
 
24
  def extract_text_from_docx(docx_file):
 
25
  doc = docx.Document(docx_file)
26
  return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
27
 
28
  def extract_text_from_pptx(pptx_file):
 
29
  text = []
30
  try:
31
  presentation = pptx.Presentation(pptx_file)
 
38
  return "\n".join(text)
39
 
40
  def extract_text_from_xlsx(xlsx_file):
 
41
  text = []
42
  try:
43
  wb = openpyxl.load_workbook(xlsx_file)
 
49
  return f"Error reading XLSX: {e}"
50
  return "\n".join(text)
51
 
52
+ # ---- MAIN QA FUNCTION ----
 
53
  def answer_question_from_doc(file, question):
 
54
  ext = file.name.split(".")[-1].lower()
55
+
56
  if ext == "pdf":
57
  context = extract_text_from_pdf(file.name)
58
  elif ext == "docx":
 
62
  elif ext == "xlsx":
63
  context = extract_text_from_xlsx(file.name)
64
  else:
65
+ return "Unsupported file format."
66
 
67
  if not context.strip():
68
+ return "No text found in the document."
69
 
 
70
  try:
71
  result = qa_model({"question": question, "context": context})
72
  return result["answer"]
73
  except Exception as e:
74
  return f"Error generating answer: {e}"