Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,19 +4,20 @@ import time
|
|
4 |
import re
|
5 |
import logging
|
6 |
from datetime import datetime
|
|
|
7 |
import gradio as gr
|
8 |
import google.generativeai as genai
|
9 |
from PyPDF2 import PdfReader
|
10 |
from tika import parser
|
11 |
|
12 |
# Configure logging
|
13 |
-
|
14 |
logging.basicConfig(
|
15 |
level=logging.INFO,
|
16 |
-
format=
|
17 |
handlers=[
|
18 |
logging.StreamHandler(),
|
19 |
-
logging.FileHandler(
|
20 |
]
|
21 |
)
|
22 |
logger = logging.getLogger("pdf_processor")
|
@@ -29,125 +30,264 @@ except ImportError:
|
|
29 |
UNSTRUCTURED_AVAILABLE = False
|
30 |
logger.warning("unstructured.partition.pdf not available; skipping that extraction method")
|
31 |
|
32 |
-
# Load API key from environment
|
33 |
-
API_KEY = os.getenv("GOOGLE_API_KEY"
|
34 |
-
if
|
35 |
-
logger.warning("GOOGLE_API_KEY not set in environment.")
|
36 |
-
else:
|
37 |
genai.configure(api_key=API_KEY)
|
|
|
|
|
38 |
|
39 |
# Globals to store state
|
40 |
EXTRACTED_TEXT = ""
|
41 |
PDF_SECTIONS = []
|
42 |
EXTRACTION_METHOD = ""
|
43 |
|
|
|
44 |
# --- Extraction Functions ---
|
45 |
def extract_text_with_unstructured(pdf_path):
|
46 |
logger.info("Extracting via Unstructured.io...")
|
47 |
elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)
|
48 |
-
sections, current = [], {"title":"Introduction","content":""}
|
49 |
for e in elements:
|
50 |
if hasattr(e, "text") and (t := e.text.strip()):
|
51 |
-
|
52 |
-
|
53 |
-
current
|
|
|
|
|
54 |
else:
|
55 |
current["content"] += t + "\n\n"
|
56 |
-
if current["content"]:
|
|
|
57 |
return sections
|
58 |
|
|
|
59 |
def extract_text_with_pypdf(pdf_path):
|
60 |
logger.info("Extracting via PyPDF2...")
|
61 |
reader = PdfReader(pdf_path)
|
62 |
-
|
63 |
-
for i,
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
def extract_text_with_tika(pdf_path):
|
72 |
logger.info("Extracting via Tika...")
|
73 |
parsed = parser.from_file(pdf_path)
|
74 |
-
lines = parsed.get("content"
|
75 |
-
sections, current = [], {"title":"Introduction","content":""}
|
76 |
for ln in lines:
|
77 |
ln = ln.strip()
|
78 |
-
if not ln:
|
79 |
-
|
80 |
-
|
81 |
-
current
|
|
|
|
|
82 |
else:
|
83 |
current["content"] += ln + "\n\n"
|
84 |
-
if current["content"]:
|
|
|
85 |
return sections
|
86 |
|
|
|
87 |
# --- Gemini API calls ---
|
88 |
def generate_greg_brockman_summary(content):
|
89 |
-
model = genai.GenerativeModel(
|
90 |
prompt = f"""
|
91 |
-
You are an expert document analyst
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
{content}
|
93 |
"""
|
94 |
try:
|
95 |
resp = model.generate_content(prompt)
|
96 |
return resp.text, None
|
97 |
except Exception as e:
|
98 |
-
logger.error(e)
|
99 |
return None, str(e)
|
100 |
|
|
|
101 |
def answer_question_about_pdf(content, question):
|
102 |
-
model = genai.GenerativeModel(
|
103 |
prompt = f"""
|
104 |
-
You are a precise document analysis assistant
|
|
|
105 |
DOCUMENT CONTENT:
|
106 |
{content}
|
|
|
107 |
QUESTION: {question}
|
108 |
"""
|
109 |
try:
|
110 |
resp = model.generate_content(prompt)
|
111 |
return resp.text, None
|
112 |
except Exception as e:
|
113 |
-
logger.error(e)
|
114 |
return None, str(e)
|
115 |
|
116 |
-
|
|
|
117 |
def process_pdf(pdf_file, progress=gr.Progress()):
|
118 |
global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
|
|
|
119 |
if not API_KEY:
|
120 |
-
return None, None, "β Set GOOGLE_API_KEY in
|
121 |
if pdf_file is None:
|
122 |
return None, None, "β No file uploaded.", ""
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
126 |
methods = []
|
127 |
-
if UNSTRUCTURED_AVAILABLE:
|
128 |
-
|
129 |
-
methods
|
130 |
-
|
131 |
-
|
132 |
-
]
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
save_sum_btn = gr.Button("Save Summary")
|
144 |
-
save_sum_status = gr.Markdown(
|
145 |
save_sum_btn.click(save_summary, inputs=[summary_out], outputs=[save_sum_status])
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
147 |
save_qa_btn = gr.Button("Save Q&A")
|
148 |
-
save_qa_status = gr.Markdown(
|
149 |
-
save_qa_btn.click(save_qa, inputs=[
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
if __name__ == "__main__":
|
152 |
-
#
|
153 |
-
app.launch()
|
|
|
4 |
import re
|
5 |
import logging
|
6 |
from datetime import datetime
|
7 |
+
|
8 |
import gradio as gr
|
9 |
import google.generativeai as genai
|
10 |
from PyPDF2 import PdfReader
|
11 |
from tika import parser
|
12 |
|
13 |
# Configure logging
|
14 |
+
# Log records are written both to the console and to LOG_FILE.
LOG_FILE = "pdf_processor_log.txt"
_console_handler = logging.StreamHandler()
_file_handler = logging.FileHandler(LOG_FILE)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[_console_handler, _file_handler],
)
logger = logging.getLogger("pdf_processor")
|
|
|
30 |
UNSTRUCTURED_AVAILABLE = False
|
31 |
logger.warning("unstructured.partition.pdf not available; skipping that extraction method")
|
32 |
|
33 |
+
# Read the Gemini API key from the environment (on a Space, store it in
# Secrets as GOOGLE_API_KEY); the client is configured only when it is set.
API_KEY = os.getenv("GOOGLE_API_KEY")
if not API_KEY:
    logger.warning("GOOGLE_API_KEY not set in environment.")
else:
    genai.configure(api_key=API_KEY)

# Module-level state shared by the processing and Q&A callbacks
EXTRACTED_TEXT = ""      # combined section text of the last processed PDF
PDF_SECTIONS = []        # list of {"title", "content"} dicts for that PDF
EXTRACTION_METHOD = ""   # name of the extractor that produced them
|
44 |
|
45 |
+
|
46 |
# --- Extraction Functions ---
|
47 |
def extract_text_with_unstructured(pdf_path):
    """Split a PDF into titled sections using ``partition_pdf``.

    Each element's text is classified with a simple heuristic: a short line
    (under 80 characters) that is all upper-case, ends with ``:``, or starts
    with a number followed by whitespace opens a new section. Everything else
    is appended to the current section, which starts out titled
    "Introduction".

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts. Sections that
        collected no content are omitted, so the list may be empty.
    """
    logger.info("Extracting via Unstructured.io...")
    elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)

    sections = []
    title, body = "Introduction", []

    for element in elements:
        # Guard against elements whose ``text`` attribute is absent or None
        # instead of crashing on ``None.strip()``.
        text = (getattr(element, "text", None) or "").strip()
        if not text:
            continue

        # Section header heuristic
        is_header = len(text) < 80 and bool(
            text.isupper()
            or text.endswith(":")
            or re.match(r"^[0-9]+\.?\s+", text)
        )
        if is_header:
            if body:
                sections.append({"title": title, "content": "".join(body)})
            title, body = text, []
        else:
            body.append(text + "\n\n")

    if body:
        sections.append({"title": title, "content": "".join(body)})
    return sections
|
63 |
|
64 |
+
|
65 |
def extract_text_with_pypdf(pdf_path):
    """Extract page text with PyPDF2 and split it into titled sections.

    Page texts are joined with ``--- Page N ---`` markers and then split on
    lines that look like headings (an upper-case run, optionally ending in
    ``:``, or ``<number>. <Capitalised text>``).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts. The list is empty
        when no page yields any text, so the caller can fall back to another
        extractor. When no heading is found the whole text is returned as a
        single "Document" section.
    """
    logger.info("Extracting via PyPDF2...")
    reader = PdfReader(pdf_path)

    pages = []
    for number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        if text:
            pages.append(f"\n\n--- Page {number} ---\n\n{text}")

    # Nothing extractable (e.g. an image-only PDF): report "no sections"
    # rather than a truthy section with empty content.
    if not pages:
        return []
    full_text = "".join(pages)

    # The capturing group makes re.split return
    # [preamble, heading1, body1, heading2, body2, ...].
    parts = re.split(r"\n\s*([A-Z][A-Z\s]+:?|[0-9]+\.\s+[A-Z].*?)\s*\n", full_text)
    if len(parts) == 1:
        # fallback single section
        return [{"title": "Document", "content": full_text}]

    sections = []
    # Keep text that precedes the first heading unless it is only page markers.
    preamble = parts[0].strip()
    if re.sub(r"--- Page \d+ ---", "", preamble).strip():
        sections.append({"title": "Introduction", "content": preamble})
    sections.extend(
        {"title": heading.strip(), "content": body.strip()}
        for heading, body in zip(parts[1::2], parts[2::2])
    )
    return sections
|
81 |
+
|
82 |
|
83 |
def extract_text_with_tika(pdf_path):
    """Parse a PDF with Tika and group its non-blank lines into sections.

    A stripped line shorter than 80 characters that is all upper-case, ends
    with ``:``, or starts with a number followed by whitespace and a capital
    letter starts a new section; other lines are appended to the current
    section (initially titled "Introduction").

    Returns:
        A list of ``{"title": str, "content": str}`` dicts; sections without
        content are left out.
    """
    logger.info("Extracting via Tika...")
    parsed = parser.from_file(pdf_path)
    raw_text = parsed.get("content") or ""

    sections = []
    heading, body = "Introduction", []

    for raw_line in raw_text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        looks_like_heading = len(line) < 80 and (
            line.isupper()
            or line.endswith(":")
            or re.match(r"^[0-9]+\.?\s+[A-Z]", line)
        )
        if not looks_like_heading:
            body.append(line + "\n\n")
            continue

        # A heading closes the section collected so far (if it has content).
        if body:
            sections.append({"title": heading, "content": "".join(body)})
        heading, body = line, []

    if body:
        sections.append({"title": heading, "content": "".join(body)})
    return sections
|
101 |
|
102 |
+
|
103 |
# --- Gemini API calls ---
|
104 |
def generate_greg_brockman_summary(content, model_name="gemini-1.5-pro"):
    """Ask Gemini for a "Greg Brockman template" summary of ``content``.

    Args:
        content: Document text inserted into the prompt.
        model_name: Gemini model to use; defaults to the model this function
            previously hard-coded.

    Returns:
        ``(summary_text, None)`` on success, or ``(None, error_message)`` if
        the API call or reading the response text raises.
    """
    model = genai.GenerativeModel(model_name)
    # NOTE(review): the template section below still contains the placeholder
    # "... (rest of template) ..." - the full template text needs to be filled in.
    prompt = f"""
You are an expert document analyst specializing in proposal evaluation.

# GREG BROCKMAN TEMPLATE STRUCTURE
1. GOAL: ...
... (rest of template) ...

CONTENT TO ANALYZE:
{content}
"""
    try:
        resp = model.generate_content(prompt)
        return resp.text, None
    except Exception as e:
        # logger.exception keeps the traceback in the log for diagnosis.
        logger.exception("Summary generation error: %s", e)
        return None, str(e)
|
122 |
|
123 |
+
|
124 |
def answer_question_about_pdf(content, question, model_name="gemini-1.5-pro"):
    """Ask Gemini to answer ``question`` using ``content`` as the document.

    Args:
        content: Document text inserted into the prompt.
        question: The user's question.
        model_name: Gemini model to use; defaults to the model this function
            previously hard-coded.

    Returns:
        ``(answer_text, None)`` on success, or ``(None, error_message)`` if
        the API call or reading the response text raises.
    """
    model = genai.GenerativeModel(model_name)
    prompt = f"""
You are a precise document analysis assistant.

DOCUMENT CONTENT:
{content}

QUESTION: {question}
"""
    try:
        resp = model.generate_content(prompt)
        return resp.text, None
    except Exception as e:
        # logger.exception keeps the traceback in the log for diagnosis.
        logger.exception("Q&A generation error: %s", e)
        return None, str(e)
|
140 |
|
141 |
+
|
142 |
+
# --- Processing & Q&A Handlers ---
|
143 |
def _combine_sections(sections, max_chars=30000):
    """Build the prompt text and a numbered outline from extracted sections.

    A section is included in full only while the combined text stays below
    ``max_chars``; otherwise it is replaced by a "[Truncated]" stub and the
    outline entry is marked accordingly.

    Returns:
        ``(combined_text, structure_outline)``.
    """
    pieces, outline = [], []
    total = 0
    for idx, sec in enumerate(sections, start=1):
        outline.append(f"{idx}. {sec['title']}\n")
        chunk = f"## {sec['title']}\n{sec['content']}\n\n"
        if total + len(chunk) >= max_chars:
            chunk = f"## {sec['title']}\n[Truncated]\n\n"
            outline.append("   [Content truncated]\n")
        pieces.append(chunk)
        total += len(chunk)
    return "".join(pieces), "".join(outline)


def process_pdf(pdf_file, progress=gr.Progress()):
    """Extract text from an uploaded PDF and summarise it with Gemini.

    Extractors are tried in order (unstructured when available, then PyPDF2,
    then Tika) and the first one returning any sections wins. The combined
    text, the sections and the extractor name are stored in the module
    globals for later questions.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input: either a path
            string or an object whose ``name`` attribute is the path of the
            uploaded file, or None when nothing was uploaded.
        progress: Gradio progress tracker (currently unused).

    Returns:
        A 4-tuple ``(summary, structure, status, info)`` matching the click
        handler's outputs. On a summary error, ``info`` carries the combined
        extracted text.
    """
    global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD

    if not API_KEY:
        return None, None, "❌ Set GOOGLE_API_KEY in Secrets.", ""
    if pdf_file is None:
        return None, None, "❌ No file uploaded.", ""

    # The upload already exists on disk, so read it in place. Re-writing it to
    # os.path.join(tmpdir, pdf_file.name) resolved to the same absolute path
    # and truncated the upload before it could be read.
    path = getattr(pdf_file, "name", pdf_file)

    # Choose methods
    methods = []
    if UNSTRUCTURED_AVAILABLE:
        methods.append(("unstructured", extract_text_with_unstructured))
    methods += [
        ("pypdf", extract_text_with_pypdf),
        ("tika", extract_text_with_tika),
    ]

    sections = None
    for name, extract in methods:
        try:
            found = extract(path)
        except Exception as e:
            # Best effort: a failing extractor just hands over to the next one.
            logger.warning(f"{name} failed: {e}")
            continue
        if found:
            sections = found
            EXTRACTION_METHOD = name
            break
    if not sections:
        return None, None, "❌ Extraction failed.", ""

    # Combine & store
    combined, structure = _combine_sections(sections)
    EXTRACTED_TEXT = combined
    PDF_SECTIONS = sections

    # Generate summary
    summary, err = generate_greg_brockman_summary(combined)
    if err:
        return None, structure, f"❌ {err}", combined

    return summary, structure, "✅ PDF processed successfully", f"Used {EXTRACTION_METHOD}."
|
199 |
+
|
200 |
+
|
201 |
+
def ask_question(question):
    """Answer a question about the most recently processed PDF.

    Args:
        question: Text from the question box; may be None or blank.

    Returns:
        The model's answer, or a "❌ ..." message when the API key is
        missing, no PDF has been processed, the question is empty, or the
        model call failed.
    """
    if not API_KEY:
        return "❌ Set GOOGLE_API_KEY in Secrets."
    if not EXTRACTED_TEXT:
        return "❌ Please upload & process a PDF first."
    # ``not question`` also covers None, which has no .strip().
    if not question or not question.strip():
        return "❌ Enter a question."

    answer, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
    if err:
        return f"❌ {err}"
    return answer
|
213 |
+
|
214 |
+
|
215 |
+
def view_log():
    """Return the contents of LOG_FILE, or an error message if unreadable."""
    try:
        # The context manager closes the handle; errors="replace" lets a
        # stray undecodable byte show as a placeholder instead of failing.
        with open(LOG_FILE, errors="replace") as log_file:
            return log_file.read()
    except Exception as e:
        return f"Error reading log: {e}"
|
220 |
+
|
221 |
+
|
222 |
+
def save_summary(summary):
    """Write ``summary`` to a timestamped text file in the working directory.

    Args:
        summary: Summary text; None or an empty string means nothing to save.

    Returns:
        A status message: "✅ Saved to <file>" on success, or
        "❌ No summary to save." when ``summary`` is empty.
    """
    if not summary:
        return "❌ No summary to save."
    fn = f"summary_{datetime.now():%Y%m%d_%H%M%S}.txt"
    with open(fn, "w", encoding="utf-8") as f:
        f.write(summary)
    return f"✅ Saved to {fn}"
|
229 |
+
|
230 |
+
|
231 |
+
def save_qa(question, answer):
    """Write a question/answer pair to a timestamped text file.

    Args:
        question: The question text.
        answer: The answer text.

    Returns:
        A status message: "✅ Saved to <file>" on success, or
        "❌ Nothing to save." when either argument is empty or None.
    """
    if not question or not answer:
        return "❌ Nothing to save."
    fn = f"qa_{datetime.now():%Y%m%d_%H%M%S}.txt"
    with open(fn, "w", encoding="utf-8") as f:
        f.write(f"Q: {question}\n\nA: {answer}")
    return f"✅ Saved to {fn}"
|
238 |
+
|
239 |
+
|
240 |
+
# --- Gradio UI ---
|
241 |
+
def _configure_api_key(key):
    """Apply an API key typed into the Setup tab.

    Besides configuring the Gemini client, this updates the module-level
    API_KEY that the processing and Q&A handlers check; without that they
    keep reporting a missing key even after the client was configured.

    Returns:
        A status message for the Setup tab.
    """
    global API_KEY
    key = (key or "").strip()
    if not key:
        return "❌ Enter an API key."
    genai.configure(api_key=key)
    API_KEY = key
    return "✅ API configured"


# --- Gradio UI ---
# NOTE(review): the nesting of widgets inside each gr.Row() was reconstructed
# from a view of this file that had lost its indentation - confirm the layout.
# NOTE(review): the emoji in the page title was unreadable in that view; a
# document emoji is assumed.
with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
    gr.Markdown("# 📄 PDF Analyzer with Gemini API")
    gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")

    with gr.Tab("Setup"):
        with gr.Row():
            api_key_input = gr.Textbox(
                label="Google Gemini API Key",
                type="password",
                placeholder="Set in Secrets (GOOGLE_API_KEY)",
            )
            api_button = gr.Button("Configure API")
        api_status = gr.Markdown("⚠️ Using environment GOOGLE_API_KEY")
        api_button.click(
            fn=_configure_api_key,
            inputs=[api_key_input],
            outputs=[api_status],
        )

    with gr.Tab("PDF Processing"):
        with gr.Row():
            pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
            proc_btn = gr.Button("Process PDF", variant="primary")
        status = gr.Markdown("Awaiting upload...")
        summary_out = gr.Textbox(label="Summary", lines=15)
        structure_out = gr.Textbox(label="Structure", lines=8)
        log_info = gr.Textbox(label="Internal Log", lines=5)
        # Output order must match the 4-tuple returned by process_pdf.
        proc_btn.click(
            fn=process_pdf,
            inputs=[pdf_file],
            outputs=[summary_out, structure_out, status, log_info],
        )
        save_sum_btn = gr.Button("Save Summary")
        save_sum_status = gr.Markdown()
        save_sum_btn.click(save_summary, inputs=[summary_out], outputs=[save_sum_status])

    with gr.Tab("Ask Questions"):
        question_in = gr.Textbox(label="Your Question", lines=2)
        ask_btn = gr.Button("Ask", variant="primary")
        answer_out = gr.Textbox(label="Answer", lines=10)
        ask_btn.click(ask_question, inputs=[question_in], outputs=[answer_out])
        save_qa_btn = gr.Button("Save Q&A")
        save_qa_status = gr.Markdown()
        save_qa_btn.click(save_qa, inputs=[question_in, answer_out], outputs=[save_qa_status])

    with gr.Tab("System Log"):
        refresh_btn = gr.Button("Refresh Log")
        sys_log = gr.Textbox(label="System Log", lines=20)
        refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
|
290 |
|
291 |
if __name__ == "__main__":
    # On Hugging Face Spaces, share=True isn't needed; server_name="0.0.0.0"
    # binds the server to all network interfaces so it can be reached from
    # outside the host it runs on.
    app.launch(server_name="0.0.0.0")
|