Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -36,64 +36,73 @@ ALLOWED_EXTENSIONS = {"pdf", "docx", "pptx", "xlsx"}
|
|
36 |
|
37 |
def validate_file_type(file):
    """Check an uploaded file's extension against ALLOWED_EXTENSIONS.

    Args:
        file: The upload handed over by the UI. Either an object exposing
            a ``name`` attribute or a plain path string is accepted.

    Returns:
        None when the extension is allowed, otherwise a human-readable
        error message.
    """
    if file is None:
        # Nothing was uploaded; report it instead of raising AttributeError.
        return "No file uploaded."
    # Objects with ``.name`` and bare path strings are both supported.
    filename = str(getattr(file, "name", file))
    # Only the text after the last dot counts; a name without any dot is
    # compared as a whole and therefore reported as unsupported.
    ext = filename.rsplit(".", 1)[-1].lower()
    if ext not in ALLOWED_EXTENSIONS:
        return f"Unsupported file format: {ext}"
    return None
|
42 |
|
43 |
# Function to truncate text to 450 tokens
|
44 |
def truncate_text(text, max_tokens=450):
    """Keep at most ``max_tokens`` whitespace-separated words of ``text``.

    "Tokens" here are plain whitespace-delimited words, not model tokens.

    Args:
        text: The string to shorten.
        max_tokens: Maximum number of words to keep. Values below zero are
            treated as zero.

    Returns:
        The kept words joined by single spaces.
    """
    words = text.split()
    # A negative limit would otherwise slice words off the end instead of
    # keeping a prefix.
    limit = max(max_tokens, 0)
    return " ".join(words[:limit])
|
|
|
|
|
47 |
|
48 |
# Document Text Extraction Functions
|
49 |
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Passed unchanged to ``fitz.open``.

    Returns:
        The page texts joined by newlines, "No text found." when the pages
        yield nothing but whitespace, or an "Error reading PDF: ..."
        message when any exception is raised.
    """
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_file) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        # Empty pages still contribute newline separators, so test the
        # stripped string rather than the raw one.
        return text if text.strip() else "No text found."
    except Exception as e:
        return f"Error reading PDF: {e}"
|
56 |
|
57 |
def extract_text_with_tika(file):
    """Extract text from a document via ``parser.from_buffer``.

    Args:
        file: Passed unchanged to ``parser.from_buffer``.

    Returns:
        The stripped "content" value of the parse result, "No text found."
        when that value is missing or empty, or an
        "Error reading document: ..." message when any exception is raised.
    """
    try:
        parsed = parser.from_buffer(file)
        # The "content" key can be present with a None value, which a
        # ``.get`` default does not cover; normalise it before stripping.
        content = (parsed.get("content") or "").strip()
        return content or "No text found."
    except Exception as e:
        return f"Error reading document: {e}"
|
63 |
|
64 |
def extract_text_from_pptx(pptx_file):
    """Collect the text of every shape on every slide of a presentation.

    Args:
        pptx_file: Passed unchanged to ``Presentation``.

    Returns:
        The non-blank shape texts joined by newlines, "No text found." when
        no shape carries text, or an "Error reading PPTX: ..." message when
        any exception is raised.
    """
    try:
        ppt = Presentation(pptx_file)
        text = []
        for slide in ppt.slides:
            for shape in slide.shapes:
                # Not every shape exposes ``text``; blank ones are skipped
                # so that the "No text found." fallback can trigger.
                shape_text = getattr(shape, "text", "")
                if shape_text and shape_text.strip():
                    text.append(shape_text)
        return "\n".join(text) if text else "No text found."
    except Exception as e:
        return f"Error reading PPTX: {e}"
|
75 |
|
76 |
def extract_text_from_excel(excel_file):
    """Flatten every worksheet of a workbook into lines of text.

    Each row becomes one line made of its non-empty cell values separated
    by spaces; rows with no values are omitted.

    Args:
        excel_file: Passed unchanged to ``openpyxl.load_workbook``.

    Returns:
        The row lines joined by newlines, "No text found." when no row has
        a value, or an "Error reading Excel: ..." message when any
        exception is raised.
    """
    try:
        wb = openpyxl.load_workbook(excel_file, read_only=True)
        try:
            text = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    # Empty cells arrive as None; str() would turn them
                    # into the literal word "None".
                    cells = [str(value) for value in row if value is not None]
                    if cells:
                        text.append(" ".join(cells))
        finally:
            # Workbooks opened read-only keep the file open until closed.
            wb.close()
        return "\n".join(text) if text else "No text found."
    except Exception as e:
        return f"Error reading Excel: {e}"
|
86 |
|
87 |
def extract_text_from_image(image_file):
    """Run text recognition on an image and return the recognised text.

    Args:
        image_file: A NumPy array, an already opened image, or anything
            ``Image.open`` accepts. None means no image was supplied.

    Returns:
        The recognised fragments joined by spaces, "No text found." when
        the reader returns nothing, or "No meaningful content detected in
        the image." for a missing or near-uniform image.
    """
    if image_file is None:
        return "No meaningful content detected in the image."

    # NOTE(review): assumes ``Image`` is PIL's Image module — confirm
    # against the file's imports. Arrays and opened images cannot go
    # through Image.open, so they are converted/used directly.
    if isinstance(image_file, np.ndarray):
        image = Image.fromarray(image_file)
    elif isinstance(image_file, Image.Image):
        image = image_file
    else:
        image = Image.open(image_file)

    # Build the pixel array once; it feeds both the check and the reader.
    pixels = np.array(image.convert("RGB"))
    if pixels.std() < 10:  # Low contrast = likely empty
        return "No meaningful content detected in the image."

    result = reader.readtext(pixels)
    # The second element of each result entry is the recognised string.
    return " ".join(res[1] for res in result) if result else "No text found."
|
94 |
|
95 |
# Function to answer questions based on document content
|
96 |
def answer_question_from_document(file, question):
|
|
|
97 |
validation_error = validate_file_type(file)
|
98 |
if validation_error:
|
99 |
return validation_error
|
@@ -106,22 +115,25 @@ def answer_question_from_document(file, question):
|
|
106 |
elif file_ext == "xlsx":
|
107 |
text = extract_text_from_excel(file)
|
108 |
else:
|
109 |
-
return "Unsupported file format!"
|
110 |
|
111 |
if not text:
|
112 |
-
return "No text extracted from the document."
|
113 |
|
114 |
truncated_text = truncate_text(text)
|
|
|
115 |
response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
|
116 |
|
117 |
return response[0]["generated_text"]
|
118 |
|
119 |
def answer_question_from_image(image, question):
    """Answer a question using the text recognised in an image.

    Args:
        image: The image to read; None means nothing was uploaded.
        question: The question to put to ``qa_pipeline``.

    Returns:
        The "generated_text" of the first pipeline result, or
        "No meaningful content detected in the image." when there is no
        image or no extracted text.
    """
    if image is None:
        # No upload: answer directly instead of failing inside extraction.
        return "No meaningful content detected in the image."

    image_text = extract_text_from_image(image)
    if not image_text:
        return "No meaningful content detected in the image."

    truncated_text = truncate_text(image_text)
    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")

    return response[0]["generated_text"]
|
@@ -129,20 +141,20 @@ def answer_question_from_image(image, question):
|
|
129 |
# Gradio UI for Document & Image QA
|
130 |
doc_interface = gr.Interface(
    fn=answer_question_from_document,
    inputs=[
        gr.File(
            label="Upload Document",
            # Offer only the extensions the validator accepts.
            file_types=[f".{ext}" for ext in sorted(ALLOWED_EXTENSIONS)],
        ),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs="text",
    title="AI Document Question Answering",
)

img_interface = gr.Interface(
    fn=answer_question_from_image,
    inputs=[gr.Image(label="Upload Image"), gr.Textbox(label="Ask a Question")],
    outputs="text",
    title="AI Image Question Answering",
)

# Mount Gradio Interfaces
# Both interfaces are shown as tabs and served from the application root.
demo = gr.TabbedInterface([doc_interface, img_interface], ["Document QA", "Image QA"])
app = gr.mount_gradio_app(app, demo, path="/")
|
147 |
|
148 |
@app.get("/")
|
|
|
36 |
|
37 |
def validate_file_type(file):
    """Check an uploaded file's extension against ALLOWED_EXTENSIONS.

    Args:
        file: The upload handed over by the UI. Either an object exposing
            a ``name`` attribute or a plain path string is accepted.

    Returns:
        None when the extension is allowed, otherwise a human-readable
        error message.
    """
    if file is None:
        # Nothing was uploaded; report it instead of raising AttributeError.
        return "❌ No file uploaded."
    # Objects with ``.name`` and bare path strings are both supported.
    filename = str(getattr(file, "name", file))
    # Only the text after the last dot counts; a name without any dot is
    # compared as a whole and therefore reported as unsupported.
    ext = filename.rsplit(".", 1)[-1].lower()
    print(f"Validating file type: {ext}")
    if ext not in ALLOWED_EXTENSIONS:
        return f"❌ Unsupported file format: {ext}"
    return None
|
43 |
|
44 |
# Function to truncate text to 450 tokens
|
45 |
def truncate_text(text, max_tokens=450):
    """Keep at most ``max_tokens`` whitespace-separated words of ``text``.

    "Tokens" here are plain whitespace-delimited words, not model tokens.
    A progress line is printed on every call, whether or not anything was
    actually cut.

    Args:
        text: The string to shorten.
        max_tokens: Maximum number of words to keep. Values below zero are
            treated as zero.

    Returns:
        The kept words joined by single spaces.
    """
    words = text.split()
    # A negative limit would otherwise slice words off the end instead of
    # keeping a prefix.
    limit = max(max_tokens, 0)
    truncated = " ".join(words[:limit])
    print(f"✂️ Truncated text to {max_tokens} tokens.")
    return truncated
|
50 |
|
51 |
# Document Text Extraction Functions
|
52 |
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Passed unchanged to ``fitz.open``.

    Returns:
        The page texts joined by newlines, a "No text found." notice when
        the pages yield nothing but whitespace, or an "Error reading PDF"
        message when any exception is raised.
    """
    try:
        print("Extracting text from PDF...")
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_file) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        # Empty pages still contribute newline separators, so test the
        # stripped string rather than the raw one.
        return text if text.strip() else "⚠️ No text found."
    except Exception as e:
        return f"❌ Error reading PDF: {e}"
|
60 |
|
61 |
def extract_text_with_tika(file):
    """Extract text from a document via ``parser.from_buffer``.

    Args:
        file: Passed unchanged to ``parser.from_buffer``.

    Returns:
        The stripped "content" value of the parse result, a
        "No text found." notice when that value is missing or empty, or an
        "Error reading document" message when any exception is raised.
    """
    try:
        print("Extracting text with Tika...")
        parsed = parser.from_buffer(file)
        # The "content" key can be present with a None value, which a
        # ``.get`` default does not cover; normalise it before stripping.
        content = (parsed.get("content") or "").strip()
        return content or "⚠️ No text found."
    except Exception as e:
        return f"❌ Error reading document: {e}"
|
68 |
|
69 |
def extract_text_from_pptx(pptx_file):
    """Collect the text of every shape on every slide of a presentation.

    Args:
        pptx_file: Passed unchanged to ``Presentation``.

    Returns:
        The non-blank shape texts joined by newlines, a "No text found."
        notice when no shape carries text, or an "Error reading PPTX"
        message when any exception is raised.
    """
    try:
        print("Extracting text from PPTX...")
        ppt = Presentation(pptx_file)
        text = []
        for slide in ppt.slides:
            for shape in slide.shapes:
                # Not every shape exposes ``text``; blank ones are skipped
                # so that the "No text found." fallback can trigger.
                shape_text = getattr(shape, "text", "")
                if shape_text and shape_text.strip():
                    text.append(shape_text)
        return "\n".join(text) if text else "⚠️ No text found."
    except Exception as e:
        return f"❌ Error reading PPTX: {e}"
|
81 |
|
82 |
def extract_text_from_excel(excel_file):
    """Flatten every worksheet of a workbook into lines of text.

    Each row becomes one line made of its non-empty cell values separated
    by spaces; rows with no values are omitted.

    Args:
        excel_file: Passed unchanged to ``openpyxl.load_workbook``.

    Returns:
        The row lines joined by newlines, a "No text found." notice when
        no row has a value, or an "Error reading Excel" message when any
        exception is raised.
    """
    try:
        print("Extracting text from Excel...")
        wb = openpyxl.load_workbook(excel_file, read_only=True)
        try:
            text = []
            for sheet in wb.worksheets:
                for row in sheet.iter_rows(values_only=True):
                    # Empty cells arrive as None; str() would turn them
                    # into the literal word "None".
                    cells = [str(value) for value in row if value is not None]
                    if cells:
                        text.append(" ".join(cells))
        finally:
            # Workbooks opened read-only keep the file open until closed.
            wb.close()
        return "\n".join(text) if text else "⚠️ No text found."
    except Exception as e:
        return f"❌ Error reading Excel: {e}"
|
93 |
|
94 |
def extract_text_from_image(image_file):
    """Run text recognition on an image and return the recognised text.

    Args:
        image_file: A NumPy array, an already opened image, or anything
            ``Image.open`` accepts. None means no image was supplied.

    Returns:
        The recognised fragments joined by spaces, a "No text found."
        notice when the reader returns nothing, or a "No meaningful
        content detected" notice for a missing or near-uniform image.
    """
    print("🖼️ Extracting text from image...")
    if image_file is None:
        return "⚠️ No meaningful content detected in the image."

    # NOTE(review): assumes ``Image`` is PIL's Image module — confirm
    # against the file's imports. Arrays and opened images cannot go
    # through Image.open, so they are converted/used directly.
    if isinstance(image_file, np.ndarray):
        image = Image.fromarray(image_file)
    elif isinstance(image_file, Image.Image):
        image = image_file
    else:
        image = Image.open(image_file)

    # Build the pixel array once; it feeds both the check and the reader.
    pixels = np.array(image.convert("RGB"))
    if pixels.std() < 10:  # Low contrast = likely empty
        return "⚠️ No meaningful content detected in the image."

    result = reader.readtext(pixels)
    # The second element of each result entry is the recognised string.
    return " ".join(res[1] for res in result) if result else "⚠️ No text found."
|
102 |
|
103 |
# Function to answer questions based on document content
|
104 |
def answer_question_from_document(file, question):
|
105 |
+
print("π Processing document for QA...")
|
106 |
validation_error = validate_file_type(file)
|
107 |
if validation_error:
|
108 |
return validation_error
|
|
|
115 |
elif file_ext == "xlsx":
|
116 |
text = extract_text_from_excel(file)
|
117 |
else:
|
118 |
+
return "β Unsupported file format!"
|
119 |
|
120 |
if not text:
|
121 |
+
return "β οΈ No text extracted from the document."
|
122 |
|
123 |
truncated_text = truncate_text(text)
|
124 |
+
print("π€ Generating response...")
|
125 |
response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")
|
126 |
|
127 |
return response[0]["generated_text"]
|
128 |
|
129 |
def answer_question_from_image(image, question):
    """Answer a question using the text recognised in an image.

    Args:
        image: The image to read; None means nothing was uploaded.
        question: The question to put to ``qa_pipeline``.

    Returns:
        The "generated_text" of the first pipeline result, or a
        "No meaningful content detected" notice when there is no image or
        no extracted text.
    """
    print("🖼️ Processing image for QA...")
    if image is None:
        # No upload: answer directly instead of failing inside extraction.
        return "⚠️ No meaningful content detected in the image."

    image_text = extract_text_from_image(image)
    if not image_text:
        return "⚠️ No meaningful content detected in the image."

    truncated_text = truncate_text(image_text)
    print("🤖 Generating response...")
    response = qa_pipeline(f"Question: {question}\nContext: {truncated_text}")

    return response[0]["generated_text"]
|
|
|
141 |
# Gradio UI for Document & Image QA
|
142 |
doc_interface = gr.Interface(
    fn=answer_question_from_document,
    inputs=[
        gr.File(
            label="Upload Document",
            # Offer only the extensions the validator accepts.
            file_types=[f".{ext}" for ext in sorted(ALLOWED_EXTENSIONS)],
        ),
        gr.Textbox(label="💬 Ask a Question"),
    ],
    outputs="text",
    title="AI Document Question Answering",
)

img_interface = gr.Interface(
    fn=answer_question_from_image,
    inputs=[gr.Image(label="🖼️ Upload Image"), gr.Textbox(label="💬 Ask a Question")],
    outputs="text",
    title="🖼️ AI Image Question Answering",
)

# Mount Gradio Interfaces
# Both interfaces are shown as tabs and served from the application root.
demo = gr.TabbedInterface([doc_interface, img_interface], ["Document QA", "🖼️ Image QA"])
app = gr.mount_gradio_app(app, demo, path="/")
|
159 |
|
160 |
@app.get("/")
|