Omarrran committed on
Commit
9406eac
·
verified ·
1 Parent(s): 4af5702

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -0
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import os
import re
import tempfile
import time
from datetime import datetime

import google.generativeai as genai
import gradio as gr
from PyPDF2 import PdfReader
from tika import parser
from unstructured.partition.pdf import partition_pdf
9
+
10
# Logging setup: every record is sent both to the console stream and to a
# log file on disk (the same file `view_log` reads back).
tmp_log = "pdf_processor_log.txt"
logging.basicConfig(
    handlers=[logging.StreamHandler(), logging.FileHandler(tmp_log)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("pdf_processor")

# Gemini credentials come from the environment. The client is configured
# only when a key is present; otherwise a warning is logged.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if API_KEY:
    genai.configure(api_key=API_KEY)
else:
    logger.warning("GOOGLE_API_KEY not set in environment.")

# Module-level state shared between the Gradio callbacks.
EXTRACTED_TEXT: str = ""      # combined text of the last processed PDF
PDF_SECTIONS: list = []       # {"title", "content"} dicts of that PDF
EXTRACTION_METHOD: str = ""   # name of the extractor that produced them
33
+
34
+ # --- Extraction Functions ---
35
def extract_text_with_unstructured(pdf_path):
    """Split a PDF into titled sections using ``partition_pdf``.

    Each element with non-blank text is classified as a heading or as body
    text. A heading is a text shorter than 80 characters that is all
    upper-case, ends with ``:``, or starts with digits (optionally followed
    by a dot) and whitespace. Body text is appended to the section opened
    by the most recent heading; text before any heading goes into a section
    titled "Introduction".

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts. Headings that
        never receive body text are not included.
    """
    logger.info("Extracting via Unstructured.io...")
    numbered_heading = re.compile(r'^[0-9]+\.?\s+')
    collected = []
    title, paragraphs = "Introduction", []

    for element in partition_pdf(filename=pdf_path, extract_images_in_pdf=False):
        if not hasattr(element, "text"):
            continue
        text = element.text.strip()
        if not text:
            continue

        looks_like_heading = len(text) < 80 and (
            text.isupper() or text.endswith(':') or numbered_heading.match(text)
        )
        if not looks_like_heading:
            paragraphs.append(text + "\n\n")
            continue

        # A new heading closes the running section (only kept if it has body).
        if paragraphs:
            collected.append({"title": title, "content": "".join(paragraphs)})
        title, paragraphs = text, []

    if paragraphs:
        collected.append({"title": title, "content": "".join(paragraphs)})
    return collected
48
+
49
def extract_text_with_pypdf(pdf_path):
    """Extract text with PyPDF2 and split it into titled sections.

    The text of every page is prefixed with a ``--- Page N ---`` marker and
    concatenated. The result is then split on lines that look like headings
    (an upper-case line, optionally ending in ``:``, or a numbered line
    such as ``1. Title``).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts:

        * ``[]`` when no page yields any non-blank text, so a caller that
          checks the result for truthiness can try another extractor
          instead of accepting an empty document;
        * a single "Document" section when no heading is detected;
        * otherwise one section per heading, preceded by an "Introduction"
          section if real text appears before the first heading.
    """
    logger.info("Extracting via PyPDF2...")
    reader = PdfReader(pdf_path)

    pages = []
    for number, page in enumerate(reader.pages, 1):
        text = page.extract_text()
        if text and text.strip():
            pages.append(f"\n\n--- Page {number} ---\n\n{text}")
    if not pages:
        return []
    full = "".join(pages)

    # With one capturing group, re.split returns
    # [preamble, title1, body1, title2, body2, ...].
    parts = re.split(r"\n\s*([A-Z][A-Z\s]+:?|[0-9]+\.\s+[A-Z].*?)\s*\n", full)
    if len(parts) == 1:
        # fallback to single section
        return [{"title": "Document", "content": full}]

    sections = []
    preamble = parts[0].strip()
    # Keep text that precedes the first heading, unless it consists only of
    # the page markers inserted above.
    if re.sub(r"--- Page [0-9]+ ---", "", preamble).strip():
        sections.append({"title": "Introduction", "content": preamble})
    sections.extend(
        {"title": title.strip(), "content": body.strip()}
        for title, body in zip(parts[1::2], parts[2::2])
    )
    return sections
60
+
61
def extract_text_with_tika(pdf_path):
    """Extract text with Apache Tika and group its lines into sections.

    Blank lines are skipped. A line is treated as a heading when it is
    shorter than 80 characters and is all upper-case, ends with ``:``, or
    starts with digits (optionally followed by a dot), whitespace and an
    upper-case letter. Other lines are appended to the section opened by
    the most recent heading; lines before any heading go into a section
    titled "Introduction".

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts, empty when Tika
        returns no text. Headings that never receive body text are not
        included.
    """
    logger.info("Extracting via Tika...")
    parsed = parser.from_file(pdf_path)
    # The "content" key can be present with a None value, in which case the
    # default passed to dict.get is not used; `or ""` covers both cases.
    raw_text = parsed.get("content") or ""

    numbered_heading = re.compile(r'^[0-9]+\.?\s+[A-Z]')
    sections = []
    title, paragraphs = "Introduction", []
    for line in raw_text.split("\n"):
        line = line.strip()
        if not line:
            continue
        is_heading = len(line) < 80 and (
            line.isupper() or line.endswith(':') or numbered_heading.match(line)
        )
        if is_heading:
            if paragraphs:
                sections.append({"title": title, "content": "".join(paragraphs)})
            title, paragraphs = line, []
        else:
            paragraphs.append(line + "\n\n")
    if paragraphs:
        sections.append({"title": title, "content": "".join(paragraphs)})
    return sections
76
+
77
+ # --- Gemini API calls ---
78
def generate_greg_brockman_summary(content):
    """Ask the 'gemini-1.5-pro' model to summarise ``content``.

    Args:
        content: Document text inserted into the prompt.

    Returns:
        ``(summary_text, None)`` on success, or ``(None, error_message)``
        when the request or reading the response text raises; the
        exception is logged in that case.
    """
    gemini = genai.GenerativeModel('gemini-1.5-pro')
    prompt = (
        "\n"
        "You are an expert document analyst...\n"
        f"{content}\n"
    )
    try:
        response = gemini.generate_content(prompt)
        summary = response.text
    except Exception as exc:
        logger.error(exc)
        return None, str(exc)
    return summary, None
90
+
91
def answer_question_about_pdf(content, question):
    """Ask the 'gemini-1.5-pro' model a question about ``content``.

    Args:
        content: Document text inserted into the prompt.
        question: The user's question, appended after the document text.

    Returns:
        ``(answer_text, None)`` on success, or ``(None, error_message)``
        when the request or reading the response text raises; the
        exception is logged in that case.
    """
    gemini = genai.GenerativeModel('gemini-1.5-pro')
    prompt = (
        "\n"
        "You are a precise document analysis assistant...\n"
        "DOCUMENT CONTENT:\n"
        f"{content}\n"
        f"QUESTION: {question}\n"
    )
    try:
        response = gemini.generate_content(prompt)
        reply = response.text
    except Exception as exc:
        logger.error(exc)
        return None, str(exc)
    return reply, None
105
+
106
+ # --- Processing & Q&A ---
107
def process_pdf(pdf_file, progress=gr.Progress()):
    """Extract, outline and summarise an uploaded PDF.

    The upload is written to a private temporary file, and the extractors
    are tried in order (unstructured, pypdf, tika) until one returns
    sections. The sections are joined into one text (sections that would
    push it to 30000 characters or more are replaced by a "[Truncated]"
    stub) and sent to Gemini for a summary. The module globals
    ``EXTRACTED_TEXT``, ``PDF_SECTIONS`` and ``EXTRACTION_METHOD`` are reset
    at the start of every run and filled in on success, so a failed upload
    never leaves the previous document's data in place.

    Args:
        pdf_file: The upload. May be raw ``bytes`` (what a Gradio ``File``
            with ``type="binary"`` delivers), an object with ``read()``, or
            a path / object whose ``name`` attribute is a path.
        progress: Gradio progress tracker; not used by the body.

    Returns:
        A 4-tuple ``(summary, structure, status, detail)``. ``summary`` and
        ``structure`` are ``None`` when not available; ``status`` is a
        short message starting with "❌" on failure and "✅" on success.
    """
    global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
    if not API_KEY:
        return None, None, "❌ Set GOOGLE_API_KEY in settings.", ""
    if pdf_file is None:
        return None, None, "❌ No file uploaded.", ""

    # Forget the previous document before doing anything that can fail.
    EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD = "", [], ""

    # Normalise whatever the upload widget handed over into raw bytes.
    try:
        if isinstance(pdf_file, (bytes, bytearray)):
            data = bytes(pdf_file)
        elif hasattr(pdf_file, "read"):
            data = pdf_file.read()
        else:
            with open(getattr(pdf_file, "name", pdf_file), "rb") as src:
                data = src.read()
    except OSError as exc:
        logger.exception("Could not read the uploaded file")
        return None, None, f"❌ Could not read upload: {exc}", ""

    methods = [("unstructured", extract_text_with_unstructured),
               ("pypdf", extract_text_with_pypdf),
               ("tika", extract_text_with_tika)]

    # mkstemp picks its own unique name, so a client-supplied file name can
    # neither escape the temp directory nor overwrite the upload itself.
    fd, path = tempfile.mkstemp(suffix=".pdf")
    try:
        with os.fdopen(fd, "wb") as out:
            out.write(data)
        for name, fn in methods:
            try:
                secs = fn(path)
            except Exception:
                # Best effort: record why this extractor failed, try the next.
                logger.exception("Extraction via %s failed", name)
                continue
            if secs:
                EXTRACTION_METHOD, PDF_SECTIONS = name, secs
                break
    finally:
        try:
            os.remove(path)
        except OSError:
            logger.warning("Could not remove temporary file %s", path)

    if not PDF_SECTIONS:
        return None, None, "❌ Extraction failed.", ""

    max_chars = 30000
    outline, blocks, size = [], [], 0
    for i, sec in enumerate(PDF_SECTIONS, 1):
        outline.append(f"{i}. {sec['title']}\n")
        block = f"## {sec['title']}\n{sec['content']}\n\n"
        if size + len(block) >= max_chars:
            # Keep the heading so the outline stays visible to the model.
            block = f"## {sec['title']}\n[Truncated]\n\n"
        blocks.append(block)
        size += len(block)
    struct = "".join(outline)
    combined = "".join(blocks)

    EXTRACTED_TEXT = combined
    summary, err = generate_greg_brockman_summary(combined)
    if err:
        return None, struct, f"❌ {err}", combined
    return summary, struct, "✅ Done", f"Used {EXTRACTION_METHOD}, {len(PDF_SECTIONS)} sections"
140
+
141
def ask_question(question):
    """Answer ``question`` against the text of the last processed PDF.

    Args:
        question: The question text entered by the user.

    Returns:
        The model's answer, or a message starting with "❌" when the API
        key is missing, no PDF text is stored, the question is blank, or
        the model call reports an error.
    """
    if not API_KEY:
        return "❌ Set GOOGLE_API_KEY."
    if not EXTRACTED_TEXT:
        return "❌ Process a PDF first."
    if not question.strip():
        return "❌ Enter a question."

    reply, failure = answer_question_about_pdf(EXTRACTED_TEXT, question)
    if failure:
        return f"❌ {failure}"
    return reply
147
+
148
def view_log():
    """Return the current contents of the log file.

    Returns:
        The text of the file named by ``tmp_log``, or the string
        "Error reading log." when the file cannot be opened, read or
        decoded.
    """
    try:
        # The context manager closes the handle even if read() fails.
        with open(tmp_log) as log_file:
            return log_file.read()
    except (OSError, UnicodeError):
        return "Error reading log."
153
+
154
def save_summary(summary):
    """Write ``summary`` to a timestamped UTF-8 text file.

    The file is created in the current working directory and named
    ``summary_YYYYMMDD_HHMMSS.txt``.

    Args:
        summary: The summary text to save.

    Returns:
        "✅ Saved to <file name>" on success, or "❌ No summary." when
        ``summary`` is empty or None (nothing is written in that case).
    """
    if not summary:
        return "❌ No summary."
    fn = f"summary_{datetime.now():%Y%m%d_%H%M%S}.txt"
    # Use a context manager so the data is flushed and the handle closed
    # before the success message is returned.
    with open(fn, 'w', encoding='utf-8') as out:
        out.write(summary)
    return f"✅ Saved to {fn}"
159
+
160
def save_qa(question, answer):
    """Write a question/answer pair to a timestamped UTF-8 text file.

    The file is created in the current working directory, named
    ``qa_YYYYMMDD_HHMMSS.txt``, and contains ``Q: <question>``, a blank
    line, then ``A: <answer>``.

    Args:
        question: The question text.
        answer: The answer text.

    Returns:
        "✅ Saved to <file name>" on success, or "❌ Incomplete Q&A." when
        either argument is empty or None (nothing is written in that case).
    """
    if not question or not answer:
        return "❌ Incomplete Q&A."
    fn = f"qa_{datetime.now():%Y%m%d_%H%M%S}.txt"
    with open(fn, 'w', encoding='utf-8') as out:
        out.write(f"Q: {question}\n\nA: {answer}")
    return f"✅ Saved to {fn}"
166
+
167
+ # --- Gradio UI ---
168
# Three tabs (processing, Q&A, log viewer) followed by two save rows.
# Components are created inside the `with` contexts so Gradio attaches
# them to the enclosing layout element.
with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
    gr.Markdown("# 📄 PDF Analyzer with Gemini API")
    gr.Markdown("Upload a PDF, get a summary, ask questions.")

    with gr.Tab("PDF Processing"):
        pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"], type="binary")
        process_btn = gr.Button("Process PDF")
        summary_out = gr.Textbox(label="Summary", lines=15)
        struct_out = gr.Textbox(label="Structure", lines=8)
        status = gr.Markdown("")
        log_out = gr.Textbox(label="Log", lines=8)
        # The four outputs receive process_pdf's 4-tuple in this order.
        process_btn.click(process_pdf, inputs=[pdf_file],
                          outputs=[summary_out, struct_out, status, log_out])

    with gr.Tab("Ask Questions"):
        question = gr.Textbox(label="Question", lines=2)
        ask_btn = gr.Button("Ask")
        answer = gr.Textbox(label="Answer", lines=10)
        ask_btn.click(ask_question, inputs=[question], outputs=[answer])

    with gr.Tab("System Log"):
        refresh = gr.Button("Refresh Log")
        syslog = gr.Textbox(label="System Log", lines=15)
        refresh.click(view_log, inputs=None, outputs=[syslog])

    # NOTE(review): the two rows below are placed at Blocks level (below
    # the tabs); confirm this matches the intended layout.
    with gr.Row():
        save_sum_btn = gr.Button("Save Summary")
        save_sum_status = gr.Markdown("")
        save_sum_btn.click(save_summary, inputs=[summary_out], outputs=[save_sum_status])

    with gr.Row():
        save_qa_btn = gr.Button("Save Q&A")
        save_qa_status = gr.Markdown("")
        save_qa_btn.click(save_qa, inputs=[question, answer], outputs=[save_qa_status])

if __name__ == "__main__":
    # For Hugging Face Spaces, set `server_name="0.0.0.0"` if needed
    app.launch()