Omarrran committed on
Commit
5c06b65
·
verified ·
1 Parent(s): 767fa10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +201 -61
app.py CHANGED
@@ -4,19 +4,20 @@ import time
4
  import re
5
  import logging
6
  from datetime import datetime
 
7
  import gradio as gr
8
  import google.generativeai as genai
9
  from PyPDF2 import PdfReader
10
  from tika import parser
11
 
12
  # Configure logging
13
- tmp_log = "pdf_processor_log.txt"
14
  logging.basicConfig(
15
  level=logging.INFO,
16
- format='%(asctime)s - %(levelname)s - %(message)s',
17
  handlers=[
18
  logging.StreamHandler(),
19
- logging.FileHandler(tmp_log)
20
  ]
21
  )
22
  logger = logging.getLogger("pdf_processor")
@@ -29,125 +30,264 @@ except ImportError:
29
  UNSTRUCTURED_AVAILABLE = False
30
  logger.warning("unstructured.partition.pdf not available; skipping that extraction method")
31
 
32
- # Load API key from environment
33
- API_KEY = os.getenv("GOOGLE_API_KEY", None)
34
- if not API_KEY:
35
- logger.warning("GOOGLE_API_KEY not set in environment.")
36
- else:
37
  genai.configure(api_key=API_KEY)
 
 
38
 
39
  # Globals to store state
40
  EXTRACTED_TEXT = ""
41
  PDF_SECTIONS = []
42
  EXTRACTION_METHOD = ""
43
 
 
44
  # --- Extraction Functions ---
45
  def extract_text_with_unstructured(pdf_path):
46
  logger.info("Extracting via Unstructured.io...")
47
  elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)
48
- sections, current = [], {"title":"Introduction","content":""}
49
  for e in elements:
50
  if hasattr(e, "text") and (t := e.text.strip()):
51
- if len(t)<80 and (t.isupper() or t.endswith(':') or re.match(r'^[0-9]+\.?\s+', t)):
52
- if current["content"]: sections.append(current)
53
- current = {"title":t, "content":""}
 
 
54
  else:
55
  current["content"] += t + "\n\n"
56
- if current["content"]: sections.append(current)
 
57
  return sections
58
 
 
59
  def extract_text_with_pypdf(pdf_path):
60
  logger.info("Extracting via PyPDF2...")
61
  reader = PdfReader(pdf_path)
62
- full = ""
63
- for i,p in enumerate(reader.pages,1):
64
- if (txt := p.extract_text()): full += f"\n\n--- Page {i} ---\n\n{txt}"
65
- parts = re.split(r"\n\s*([A-Z][A-Z\s]+:?|[0-9]+\.\s+[A-Z].*?)\s*\n", full)
66
- if len(parts)>1:
67
- return [{"title":parts[i].strip(),"content":parts[i+1].strip()} for i in range(1,len(parts),2)]
68
- # fallback to single section
69
- return [{"title":"Document","content":full}]
 
 
 
 
 
 
70
 
71
  def extract_text_with_tika(pdf_path):
72
  logger.info("Extracting via Tika...")
73
  parsed = parser.from_file(pdf_path)
74
- lines = parsed.get("content","").split("\n")
75
- sections, current = [], {"title":"Introduction","content":""}
76
  for ln in lines:
77
  ln = ln.strip()
78
- if not ln: continue
79
- if len(ln)<80 and (ln.isupper() or ln.endswith(':') or re.match(r'^[0-9]+\.?\s+[A-Z]', ln)):
80
- if current["content"]: sections.append(current)
81
- current = {"title":ln, "content":""}
 
 
82
  else:
83
  current["content"] += ln + "\n\n"
84
- if current["content"]: sections.append(current)
 
85
  return sections
86
 
 
87
  # --- Gemini API calls ---
88
  def generate_greg_brockman_summary(content):
89
- model = genai.GenerativeModel('gemini-1.5-pro')
90
  prompt = f"""
91
- You are an expert document analyst...
 
 
 
 
 
 
92
  {content}
93
  """
94
  try:
95
  resp = model.generate_content(prompt)
96
  return resp.text, None
97
  except Exception as e:
98
- logger.error(e)
99
  return None, str(e)
100
 
 
101
  def answer_question_about_pdf(content, question):
102
- model = genai.GenerativeModel('gemini-1.5-pro')
103
  prompt = f"""
104
- You are a precise document analysis assistant...
 
105
  DOCUMENT CONTENT:
106
  {content}
 
107
  QUESTION: {question}
108
  """
109
  try:
110
  resp = model.generate_content(prompt)
111
  return resp.text, None
112
  except Exception as e:
113
- logger.error(e)
114
  return None, str(e)
115
 
116
- # --- Processing & Q&A ---
 
117
  def process_pdf(pdf_file, progress=gr.Progress()):
118
  global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
 
119
  if not API_KEY:
120
- return None, None, "❌ Set GOOGLE_API_KEY in settings.", ""
121
  if pdf_file is None:
122
  return None, None, "❌ No file uploaded.", ""
123
- tmp = tempfile.gettempdir()
124
- path = os.path.join(tmp, pdf_file.name)
125
- with open(path, 'wb') as f: f.write(pdf_file.read())
 
 
 
 
 
126
  methods = []
127
- if UNSTRUCTURED_AVAILABLE:
128
- methods.append(("unstructured", extract_text_with_unstructured))
129
- methods.extend([
130
- ("pypdf", extract_text_with_pypdf),
131
- ("tika", extract_text_with_tika)
132
- ])
133
- with gr.Tab("Ask Questions"):
134
- question = gr.Textbox(label="Question", lines=2)
135
- ask_btn = gr.Button("Ask")
136
- answer = gr.Textbox(label="Answer", lines=10)
137
- ask_btn.click(ask_question, inputs=[question], outputs=[answer])
138
- with gr.Tab("System Log"):
139
- refresh = gr.Button("Refresh Log")
140
- syslog = gr.Textbox(label="System Log", lines=15)
141
- refresh.click(view_log, inputs=None, outputs=[syslog])
142
- with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  save_sum_btn = gr.Button("Save Summary")
144
- save_sum_status = gr.Markdown("")
145
  save_sum_btn.click(save_summary, inputs=[summary_out], outputs=[save_sum_status])
146
- with gr.Row():
 
 
 
 
 
147
  save_qa_btn = gr.Button("Save Q&A")
148
- save_qa_status = gr.Markdown("")
149
- save_qa_btn.click(save_qa, inputs=[question, answer], outputs=[save_qa_status])
 
 
 
 
 
150
 
151
  if __name__ == "__main__":
152
- # For Hugging Face Spaces, set `server_name="0.0.0.0"` if needed
153
- app.launch()
 
4
  import re
5
  import logging
6
  from datetime import datetime
7
+
8
  import gradio as gr
9
  import google.generativeai as genai
10
  from PyPDF2 import PdfReader
11
  from tika import parser
12
 
13
  # Configure logging
14
+ LOG_FILE = "pdf_processor_log.txt"
15
  logging.basicConfig(
16
  level=logging.INFO,
17
+ format="%(asctime)s - %(levelname)s - %(message)s",
18
  handlers=[
19
  logging.StreamHandler(),
20
+ logging.FileHandler(LOG_FILE)
21
  ]
22
  )
23
  logger = logging.getLogger("pdf_processor")
 
30
  UNSTRUCTURED_AVAILABLE = False
31
  logger.warning("unstructured.partition.pdf not available; skipping that extraction method")
32
 
33
+ # Load API key from environment (set this in your Space's Secrets as GOOGLE_API_KEY)
34
+ API_KEY = os.getenv("GOOGLE_API_KEY")
35
+ if API_KEY:
 
 
36
  genai.configure(api_key=API_KEY)
37
+ else:
38
+ logger.warning("GOOGLE_API_KEY not set in environment.")
39
 
40
  # Globals to store state
41
  EXTRACTED_TEXT = ""
42
  PDF_SECTIONS = []
43
  EXTRACTION_METHOD = ""
44
 
45
+
46
  # --- Extraction Functions ---
47
  def extract_text_with_unstructured(pdf_path):
48
  logger.info("Extracting via Unstructured.io...")
49
  elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)
50
+ sections, current = [], {"title": "Introduction", "content": ""}
51
  for e in elements:
52
  if hasattr(e, "text") and (t := e.text.strip()):
53
+ # Section header heuristic
54
+ if len(t) < 80 and (t.isupper() or t.endswith(":") or re.match(r"^[0-9]+\.?\s+", t)):
55
+ if current["content"]:
56
+ sections.append(current)
57
+ current = {"title": t, "content": ""}
58
  else:
59
  current["content"] += t + "\n\n"
60
+ if current["content"]:
61
+ sections.append(current)
62
  return sections
63
 
64
+
65
  def extract_text_with_pypdf(pdf_path):
66
  logger.info("Extracting via PyPDF2...")
67
  reader = PdfReader(pdf_path)
68
+ full_text = ""
69
+ for i, page in enumerate(reader.pages, start=1):
70
+ txt = page.extract_text()
71
+ if txt:
72
+ full_text += f"\n\n--- Page {i} ---\n\n{txt}"
73
+ parts = re.split(r"\n\s*([A-Z][A-Z\s]+:?|[0-9]+\.\s+[A-Z].*?)\s*\n", full_text)
74
+ if len(parts) > 1:
75
+ return [
76
+ {"title": parts[i].strip(), "content": parts[i + 1].strip()}
77
+ for i in range(1, len(parts), 2)
78
+ ]
79
+ # fallback single section
80
+ return [{"title": "Document", "content": full_text}]
81
+
82
 
83
  def extract_text_with_tika(pdf_path):
84
  logger.info("Extracting via Tika...")
85
  parsed = parser.from_file(pdf_path)
86
+ lines = (parsed.get("content") or "").split("\n")
87
+ sections, current = [], {"title": "Introduction", "content": ""}
88
  for ln in lines:
89
  ln = ln.strip()
90
+ if not ln:
91
+ continue
92
+ if len(ln) < 80 and (ln.isupper() or ln.endswith(":") or re.match(r"^[0-9]+\.?\s+[A-Z]", ln)):
93
+ if current["content"]:
94
+ sections.append(current)
95
+ current = {"title": ln, "content": ""}
96
  else:
97
  current["content"] += ln + "\n\n"
98
+ if current["content"]:
99
+ sections.append(current)
100
  return sections
101
 
102
+
103
  # --- Gemini API calls ---
104
  def generate_greg_brockman_summary(content):
105
+ model = genai.GenerativeModel("gemini-1.5-pro")
106
  prompt = f"""
107
+ You are an expert document analyst specializing in proposal evaluation.
108
+
109
+ # GREG BROCKMAN TEMPLATE STRUCTURE
110
+ 1. GOAL: ...
111
+ ... (rest of template) ...
112
+
113
+ CONTENT TO ANALYZE:
114
  {content}
115
  """
116
  try:
117
  resp = model.generate_content(prompt)
118
  return resp.text, None
119
  except Exception as e:
120
+ logger.error(f"Summary generation error: {e}")
121
  return None, str(e)
122
 
123
+
124
  def answer_question_about_pdf(content, question):
125
+ model = genai.GenerativeModel("gemini-1.5-pro")
126
  prompt = f"""
127
+ You are a precise document analysis assistant.
128
+
129
  DOCUMENT CONTENT:
130
  {content}
131
+
132
  QUESTION: {question}
133
  """
134
  try:
135
  resp = model.generate_content(prompt)
136
  return resp.text, None
137
  except Exception as e:
138
+ logger.error(f"Q&A generation error: {e}")
139
  return None, str(e)
140
 
141
+
142
# --- Processing & Q&A Handlers ---
def _resolve_pdf_path(pdf_file):
    """Return a filesystem path for the uploaded PDF, or None if unavailable.

    Accepts a plain path (str / os.PathLike), an object whose ``name``
    attribute points at an existing file, or a readable file-like object
    (copied into a fresh temporary file).
    """
    if isinstance(pdf_file, (str, os.PathLike)):
        return os.fspath(pdf_file)

    name = getattr(pdf_file, "name", None)
    if name and os.path.isfile(name):
        # The upload already lives on disk; use it in place. Re-opening this
        # same path with "wb" (what os.path.join(tmp_dir, <absolute name>)
        # resolves to) would truncate the PDF before it could be read.
        return name

    if hasattr(pdf_file, "read"):
        # No usable on-disk path: spill the stream into a unique temp file.
        fd, path = tempfile.mkstemp(suffix=".pdf")
        with os.fdopen(fd, "wb") as f:
            f.write(pdf_file.read())
        return path

    return None


def process_pdf(pdf_file, progress=gr.Progress()):
    """Extract text from an uploaded PDF and generate a summary.

    Tries each available extraction method in order (unstructured, PyPDF2,
    Tika) and keeps the first one that yields sections. The combined text is
    stored in the module globals ``EXTRACTED_TEXT`` / ``PDF_SECTIONS`` /
    ``EXTRACTION_METHOD`` for later Q&A.

    Args:
        pdf_file: The value delivered by the upload component.
            NOTE(review): presumably a temp-file wrapper or a path string
            depending on the Gradio version — confirm against the installed
            version.
        progress: Gradio progress tracker (currently unused).

    Returns:
        A 4-tuple ``(summary, structure, status, log_info)`` matching the
        outputs wired to the "Process PDF" button.
    """
    global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD

    if not API_KEY:
        return None, None, "❌ Set GOOGLE_API_KEY in Secrets.", ""
    if pdf_file is None:
        return None, None, "❌ No file uploaded.", ""

    path = _resolve_pdf_path(pdf_file)
    if not path:
        return None, None, "❌ Extraction failed.", ""

    # Choose methods
    methods = []
    if UNSTRUCTURED_AVAILABLE:
        methods.append(("unstructured", extract_text_with_unstructured))
    methods += [
        ("pypdf", extract_text_with_pypdf),
        ("tika", extract_text_with_tika),
    ]

    # First extractor that returns a non-empty section list wins; a failing
    # extractor is logged and the next one is tried.
    sections = None
    for name, fn in methods:
        try:
            secs = fn(path)
            if secs:
                sections = secs
                EXTRACTION_METHOD = name
                break
        except Exception as e:
            logger.warning(f"{name} failed: {e}")
    if not sections:
        return None, None, "❌ Extraction failed.", ""

    # Combine & store. Sections that would push the prompt past the size
    # budget keep their heading but have their body replaced by a marker.
    max_chars = 30000
    combined = ""
    structure = ""
    for idx, sec in enumerate(sections, start=1):
        structure += f"{idx}. {sec['title']}\n"
        chunk = f"## {sec['title']}\n{sec['content']}\n\n"
        if len(combined) + len(chunk) < max_chars:
            combined += chunk
        else:
            combined += f"## {sec['title']}\n[Truncated]\n\n"
            structure += " [Content truncated]\n"
    EXTRACTED_TEXT = combined
    PDF_SECTIONS = sections

    # Generate summary
    summary, err = generate_greg_brockman_summary(combined)
    if err:
        return None, structure, f"❌ {err}", combined

    return summary, structure, "✅ PDF processed successfully", f"Used {EXTRACTION_METHOD}."
199
+
200
+
201
def ask_question(question):
    """Answer a question about the most recently processed PDF.

    Args:
        question: The user's question text; ``None`` or blank input is
            rejected with an error message instead of raising.

    Returns:
        The model's answer, or a "❌ ..." message when the API key is
        missing, no PDF has been processed, the question is empty, or the
        model call reports an error.
    """
    if not API_KEY:
        return "❌ Set GOOGLE_API_KEY in Secrets."
    if not EXTRACTED_TEXT:
        return "❌ Please upload & process a PDF first."
    # Guard against None before calling .strip().
    if not question or not question.strip():
        return "❌ Enter a question."

    answer, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
    if err:
        return f"❌ {err}"
    return answer
213
+
214
+
215
def view_log():
    """Return the contents of the log file for display in the UI.

    Returns:
        The full text of ``LOG_FILE``, or an "Error reading log: ..." message
        if it cannot be read. Never raises, so the UI callback always gets a
        string.
    """
    try:
        # Context manager closes the handle promptly; errors="replace" keeps
        # a single undecodable byte from hiding the entire log.
        with open(LOG_FILE, errors="replace") as f:
            return f.read()
    except Exception as e:
        return f"Error reading log: {e}"
220
+
221
+
222
def save_summary(summary):
    """Write the summary to a timestamped text file in the working directory.

    Args:
        summary: Summary text to save; empty or ``None`` is rejected.

    Returns:
        A status message: "✅ Saved to <file>" on success, or a "❌ ..."
        message when there is nothing to save or the write fails.
    """
    if not summary:
        return "❌ No summary to save."
    fn = f"summary_{datetime.now():%Y%m%d_%H%M%S}.txt"
    try:
        with open(fn, "w", encoding="utf-8") as f:
            f.write(summary)
    except OSError as e:
        # Report the failure in the UI instead of crashing the callback.
        return f"❌ Could not save summary: {e}"
    return f"✅ Saved to {fn}"
229
+
230
+
231
def save_qa(question, answer):
    """Write a question/answer pair to a timestamped text file.

    Args:
        question: The question text.
        answer: The answer text.

    Returns:
        A status message: "✅ Saved to <file>" on success, or a "❌ ..."
        message when either field is empty or the write fails.
    """
    if not question or not answer:
        return "❌ Nothing to save."
    fn = f"qa_{datetime.now():%Y%m%d_%H%M%S}.txt"
    try:
        with open(fn, "w", encoding="utf-8") as f:
            f.write(f"Q: {question}\n\nA: {answer}")
    except OSError as e:
        # Report the failure in the UI instead of crashing the callback.
        return f"❌ Could not save Q&A: {e}"
    return f"✅ Saved to {fn}"
238
+
239
+
240
# --- Gradio UI ---
def _configure_api(key):
    """Configure the Gemini client from a key typed into the Setup tab.

    Also updates the module-level ``API_KEY`` so that ``process_pdf`` and
    ``ask_question`` (which gate on ``API_KEY``) accept the key; calling
    ``genai.configure`` alone leaves them rejecting every request.

    Returns:
        A status message for the Setup tab.
    """
    global API_KEY
    key = (key or "").strip()
    if not key:
        return "❌ Enter an API key."
    genai.configure(api_key=key)
    API_KEY = key
    return "✅ API configured"


# NOTE(review): container nesting below was reconstructed from a flattened
# listing — confirm the Row/Tab layout against the running app.
with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
    gr.Markdown("# 📄 PDF Analyzer with Gemini API")
    gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")

    with gr.Tab("Setup"):
        with gr.Row():
            api_key_input = gr.Textbox(
                label="Google Gemini API Key",
                type="password",
                placeholder="Set in Secrets (GOOGLE_API_KEY)"
            )
            api_button = gr.Button("Configure API")
        api_status = gr.Markdown("⚠️ Using environment GOOGLE_API_KEY")
        api_button.click(
            fn=_configure_api,
            inputs=[api_key_input],
            outputs=[api_status]
        )

    with gr.Tab("PDF Processing"):
        with gr.Row():
            pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
            proc_btn = gr.Button("Process PDF", variant="primary")
        status = gr.Markdown("Awaiting upload...")
        summary_out = gr.Textbox(label="Summary", lines=15)
        structure_out = gr.Textbox(label="Structure", lines=8)
        log_info = gr.Textbox(label="Internal Log", lines=5)
        proc_btn.click(
            fn=process_pdf,
            inputs=[pdf_file],
            outputs=[summary_out, structure_out, status, log_info]
        )
        save_sum_btn = gr.Button("Save Summary")
        save_sum_status = gr.Markdown()
        save_sum_btn.click(save_summary, inputs=[summary_out], outputs=[save_sum_status])

    with gr.Tab("Ask Questions"):
        question_in = gr.Textbox(label="Your Question", lines=2)
        ask_btn = gr.Button("Ask", variant="primary")
        answer_out = gr.Textbox(label="Answer", lines=10)
        ask_btn.click(ask_question, inputs=[question_in], outputs=[answer_out])
        save_qa_btn = gr.Button("Save Q&A")
        save_qa_status = gr.Markdown()
        save_qa_btn.click(save_qa, inputs=[question_in, answer_out], outputs=[save_qa_status])

    with gr.Tab("System Log"):
        refresh_btn = gr.Button("Refresh Log")
        sys_log = gr.Textbox(label="System Log", lines=20)
        refresh_btn.click(view_log, inputs=None, outputs=[sys_log])

if __name__ == "__main__":
    # On Hugging Face Spaces, share=True isn't needed; server_name="0.0.0.0" ensures external access
    app.launch(server_name="0.0.0.0")