PDF_Summarizer_large_file

Running

App Files Files Community

Manasa1 committed on Apr 5

Commit

3bf3fb4

verified ·

1 Parent(s): e741bed

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -18

app.py CHANGED Viewed

@@ -4,13 +4,14 @@ import gradio as gr
 from dotenv import load_dotenv
 from groq import Groq
-# Load environment variables from a .env file
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-# Instantiate the Groq client
 client = Groq(api_key=GROQ_API_KEY)
 def extract_text_from_pdf(pdf_file):
     text = ""
     with pdfplumber.open(pdf_file.name) as pdf:
@@ -20,37 +21,58 @@ def extract_text_from_pdf(pdf_file):
                 text += page_text
     return text
-def summarize_pdf(pdf_file):
-    text = extract_text_from_pdf(pdf_file)
-    if not text.strip():
-        return "No extractable text found in the PDF."
-    # Optional: Limit the text if needed for token limits
-    text = text[:15000]
-    prompt = f"Summarize the following PDF content:\n\n{text}"
     try:
         response = client.chat.completions.create(
-            messages=[
-                {
-                    "role": "user",
-                    "content": prompt
-                }
-            ],
-            model="llama3-8b-8192",  # Replace with your desired model ID
         )
         return response.choices[0].message.content.strip()
     except Exception as e:
         return f"Error during summarization: {e}"
 # Gradio interface
 iface = gr.Interface(
     fn=summarize_pdf,
     inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
     outputs="text",
-    title="PDF Summarizer with Groq",
-    description="Upload a PDF and get a summary using Groq's generative AI API."
 )
 if __name__ == "__main__":

 from dotenv import load_dotenv
 from groq import Groq
+# Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+# Instantiate Groq client
 client = Groq(api_key=GROQ_API_KEY)
+# Function to extract text from PDF
 def extract_text_from_pdf(pdf_file):
     text = ""
     with pdfplumber.open(pdf_file.name) as pdf:
                 text += page_text
     return text
+# Split text into manageable chunks (by character count)
def split_text_into_chunks(text, max_chars=2000):
    """Split *text* into word-aligned chunks of at most *max_chars* characters.

    The text is tokenised on whitespace, so runs of spaces/newlines collapse
    to single spaces inside each chunk. Words are never cut: a single word
    longer than ``max_chars`` is emitted as a chunk of its own and is the
    only case in which a chunk can exceed the limit.

    Args:
        text: The text to split.
        max_chars: Maximum length of each chunk, counting the single spaces
            that join the words.

    Returns:
        A list of non-empty chunk strings; empty when *text* has no words.
    """
    chunks = []
    current_words = []
    current_len = 0  # always equals len(" ".join(current_words))

    for word in text.split():
        # A separator space is only needed when the chunk already has a word.
        if current_words:
            needed = current_len + 1 + len(word)
        else:
            needed = len(word)

        # Only flush a chunk that actually holds words; this is what prevents
        # an empty chunk from being emitted ahead of an oversized first word.
        if current_words and needed > max_chars:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            current_words.append(word)
            current_len = needed

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks
+# Summarize a single chunk using Groq
def summarize_chunk(chunk):
    """Request a summary of one text chunk from the Groq chat API.

    Sends a single user message to the ``llama3-8b-8192`` model through the
    module-level ``client``.

    Args:
        chunk: The text section to summarise.

    Returns:
        The stripped content of the first returned choice, or a string
        starting with ``"Error during summarization:"`` if the request (or
        reading its response) raises any exception.
    """
    request_messages = [
        {
            "role": "user",
            "content": f"Summarize the following PDF section:\n\n{chunk}",
        }
    ]
    try:
        completion = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=request_messages,
        )
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as err:
        # Failures are reported as text so the caller can still show a result.
        return f"Error during summarization: {err}"
+# Main summarization function
def summarize_pdf(pdf_file):
    """Summarise an uploaded PDF section by section.

    The PDF text is extracted with ``extract_text_from_pdf``, split into
    chunks of at most 2000 characters with ``split_text_into_chunks``, and
    each chunk is summarised with ``summarize_chunk``.

    Args:
        pdf_file: The uploaded file object handed over by the Gradio file
            input, or ``None`` when nothing was uploaded.

    Returns:
        A single string with one numbered "Section N Summary" entry per
        chunk, or a short explanatory message when no file was supplied or
        the PDF contains no extractable text.
    """
    # The file input yields None when the form is submitted without a file;
    # answer with a message instead of failing on the missing upload.
    if pdf_file is None:
        return "No PDF file was uploaded."

    text = extract_text_from_pdf(pdf_file)
    if not text.strip():
        return "No extractable text found in the PDF."

    # Drop blank chunks so no API request is spent on empty input.
    chunks = [
        chunk
        for chunk in split_text_into_chunks(text, max_chars=2000)
        if chunk
    ]
    sections = [
        f"🔹 **Section {index} Summary:**\n{summarize_chunk(chunk)}\n"
        for index, chunk in enumerate(chunks, start=1)
    ]
    return "\n".join(sections)
 # Gradio interface
 iface = gr.Interface(
     fn=summarize_pdf,
     inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
     outputs="text",
+    title="📄 PDF Summarizer with Groq",
+    description="Upload a large PDF and get section-wise AI summaries using Groq's LLaMA3 model."
 )
 if __name__ == "__main__":