|
import os |
|
import pdfplumber |
|
import gradio as gr |
|
from dotenv import load_dotenv |
|
from groq import Groq |
|
|
|
# Load GROQ_API_KEY (and any other settings) from a local .env file.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# SECURITY: never print the secret itself — only report whether it was found.
if not GROQ_API_KEY:
    print("Warning: GROQ_API_KEY is not set; Groq API calls will fail.")

# Shared Groq client used by summarize_chunk().
client = Groq(api_key=GROQ_API_KEY)
|
|
|
def extract_text_from_pdf(pdf_file):
    """Extract all text from an uploaded PDF.

    Args:
        pdf_file: Uploaded file object (e.g. a gradio.File value) exposing a
            ``.name`` attribute holding the path on disk.

    Returns:
        str: The text of every page that yields extractable text, joined
        with newlines so words at page boundaries don't run together.
        Empty string if no page had extractable text.
    """
    pages = []
    with pdfplumber.open(pdf_file.name) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text)
    # Join once instead of repeated `+=` (avoids quadratic concatenation)
    # and keep a separator between pages.
    return "\n".join(pages)
|
|
|
def split_text_into_chunks(text, max_chars=2000):
    """Split *text* into whitespace-delimited chunks of at most *max_chars*.

    Words are never broken; a single word longer than ``max_chars`` becomes
    its own (oversized) chunk.

    Fixes vs. the previous version: an empty ``""`` chunk was emitted when
    the very first word already exceeded the budget, and the first word of
    every chunk was charged for a phantom leading space.

    Args:
        text: Source text; split on any whitespace run.
        max_chars: Soft upper bound on each chunk's length.

    Returns:
        list[str]: Non-empty chunks in original word order ([] for empty text).
    """
    chunks = []
    chunk = ""
    for word in text.split():
        if not chunk:
            # First word of a chunk: no separator to account for.
            chunk = word
        elif len(chunk) + 1 + len(word) <= max_chars:
            chunk += " " + word
        else:
            chunks.append(chunk)
            chunk = word
    if chunk:
        chunks.append(chunk)
    return chunks
|
|
|
def summarize_chunk(chunk):
    """Ask the Groq LLaMA3 model for a summary of one text chunk.

    Returns the model's summary with surrounding whitespace stripped, or an
    error-message string if the API call fails for any reason (best-effort:
    a failed section must not abort the whole-document summary).
    """
    prompt = f"Summarize the following PDF section:\n\n{chunk}"
    messages = [{"role": "user", "content": prompt}]
    try:
        completion = client.chat.completions.create(
            messages=messages,
            model="llama3-8b-8192",
        )
    except Exception as e:
        return f"Error during summarization: {e}"
    return completion.choices[0].message.content.strip()
|
|
|
def summarize_pdf(pdf_file):
    """End-to-end pipeline: extract the PDF's text, chunk it, and return
    a newline-joined, section-by-section summary string."""
    text = extract_text_from_pdf(pdf_file)
    if not text.strip():
        return "No extractable text found in the PDF."

    # One formatted summary entry per chunk, preserving chunk order.
    summaries = [
        f"🔹 **Section {i+1} Summary:**\n{summarize_chunk(chunk)}\n"
        for i, chunk in enumerate(split_text_into_chunks(text, max_chars=2000))
    ]
    return "\n".join(summaries)
|
|
|
# Gradio UI: a single PDF-file upload wired directly to summarize_pdf,
# rendering its returned string as plain text.
iface = gr.Interface(

    fn=summarize_pdf,

    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),

    outputs="text",

    title="📄 PDF Summarizer with Groq",

    description="Upload a large PDF and get section-wise AI summaries using Groq's LLaMA3 model."

)


# Start the local web server only when executed as a script
# (not when this module is imported).
if __name__ == "__main__":

    iface.launch()
|
|