"""Streamlit app: upload a PDF, get a structured summary, then chat with it.

Flow:
1. Text is extracted from the uploaded PDF.
2. "Analyze Text" asks the LLM for key points and a summary, builds a FAISS
   index over the text chunks, and prepares PDF/Word reports.
3. Results are kept in ``st.session_state`` so they survive Streamlit reruns
   (every widget interaction re-executes this script from the top).
"""

import io
import os
import time
from datetime import datetime
from typing import List

import PyPDF2
import streamlit as st
from docx import Document
from dotenv import load_dotenv
from fpdf import FPDF
from langchain.chains import LLMChain
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=api_key)


class KeyPoint(BaseModel):
    """One key point extracted from the document."""

    point: str = Field(description="A key point extracted from the document.")


class Summary(BaseModel):
    """Short summary of the document."""

    summary: str = Field(description="A brief summary of the document content.")


class DocumentAnalysis(BaseModel):
    """Structured LLM output: key points plus a summary."""

    key_points: List[KeyPoint] = Field(description="List of key points from the document.")
    summary: Summary = Field(description="Summary of the document.")


parser = PydanticOutputParser(pydantic_object=DocumentAnalysis)

prompt_template = """
Analyze the following text and extract key points and a summary.
{format_instructions}
Text: {text}
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = LLMChain(llm=llm, prompt=prompt, output_parser=parser)


def analyze_text_structured(text):
    """Run the analysis chain on *text* and return the chain's parsed output."""
    return chain.run(text=text)


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Pages for which the extractor yields nothing (e.g. image-only pages)
    contribute an empty string instead of breaking the concatenation.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def json_to_text(analysis):
    """Render an analysis object as plain text (summary, then numbered points)."""
    lines = ["=== Summary ===", f"{analysis.summary.summary}", "", "=== Key Points ==="]
    lines.extend(
        f"{i}. {key_point.point}"
        for i, key_point in enumerate(analysis.key_points, start=1)
    )
    return "\n".join(lines) + "\n"


def create_pdf_report(analysis):
    """Build the PDF report for *analysis* and return it as ``bytes``."""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Helvetica', '', 12)
    pdf.cell(200, 10, txt="PDF Analysis Report", ln=True, align='C')
    pdf.cell(
        200,
        10,
        txt=f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        ln=True,
        align='C',
    )
    # The built-in Helvetica font only covers latin-1. LLM output often has
    # curly quotes, dashes or bullets, so replace anything unencodable rather
    # than letting report generation fail.
    clean_text = json_to_text(analysis).encode("latin-1", "replace").decode("latin-1")
    pdf.multi_cell(0, 10, txt=clean_text)
    raw = pdf.output(dest='S')
    # Depending on the installed FPDF flavour this is a latin-1 str or a
    # bytearray; normalise to bytes so st.download_button accepts it unchanged.
    if isinstance(raw, str):
        return raw.encode("latin-1")
    return bytes(raw)


def create_word_report(analysis):
    """Build the Word (.docx) report for *analysis* and return it as ``bytes``."""
    doc = Document()
    doc.add_heading('PDF Analysis Report', 0)
    doc.add_paragraph(f'Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    clean_text = json_to_text(analysis)
    doc.add_heading('Analysis', level=1)
    doc.add_paragraph(clean_text)
    docx_bytes = io.BytesIO()
    doc.save(docx_bytes)
    return docx_bytes.getvalue()


st.set_page_config(page_title="Chat With PDF", page_icon="😒")


def local_css():
    """Inject the page's custom markup via ``st.markdown``."""
    # NOTE(review): the markup string is blank in the reviewed source; it looks
    # like the original CSS was stripped before review - confirm and restore.
    st.markdown(""" """, unsafe_allow_html=True)


local_css()

# Seed session state once; later reruns keep whatever is already stored.
_SESSION_DEFAULTS = {
    "current_file": None,
    "pdf_summary": None,
    "analysis_time": 0,
    "pdf_report": None,
    "word_report": None,
    "vectorstore": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
if "messages" not in st.session_state:
    st.session_state.messages = []

# NOTE(review): the st.markdown strings below contain only a newline in the
# reviewed source (presumably HTML wrappers that were stripped) - confirm.
st.markdown('\n', unsafe_allow_html=True)
st.markdown('\n', unsafe_allow_html=True)
st.title("😒 Chat With PDF")
st.caption("Your AI-powered Document Analyzer")
st.markdown('\n', unsafe_allow_html=True)
st.markdown('\n', unsafe_allow_html=True)

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    if st.session_state.current_file != uploaded_file.name:
        # A different file was uploaded: drop everything derived from the old one.
        st.session_state.current_file = uploaded_file.name
        st.session_state.pdf_summary = None
        st.session_state.pdf_report = None
        st.session_state.word_report = None
        # Assign None (rather than deleting the key) so later checks can rely
        # on the key always existing.
        st.session_state.vectorstore = None
        st.session_state.messages = []

    text = extract_text_from_pdf(uploaded_file)

    if st.button("Analyze Text"):
        if not text.strip():
            # Nothing to analyse or index (e.g. a scanned, image-only PDF).
            st.warning("No extractable text was found in this PDF.")
        else:
            start_time = time.time()
            with st.spinner("Analyzing..."):
                analysis = analyze_text_structured(text)

                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1000, chunk_overlap=200
                )
                chunks = text_splitter.split_text(text)
                embeddings = HuggingFaceInferenceAPIEmbeddings(
                    api_key=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                    model_name="BAAI/bge-small-en-v1.5",
                )
                vectorstore = FAISS.from_texts(chunks, embeddings)
                pdf_report = create_pdf_report(analysis)
                word_report = create_word_report(analysis)

            # Commit all results together so a failure part-way through never
            # leaves session state half-populated.
            st.session_state.pdf_summary = analysis
            st.session_state.vectorstore = vectorstore
            st.session_state.pdf_report = pdf_report
            st.session_state.word_report = word_report
            st.session_state.analysis_time = time.time() - start_time

    # Render from session state, not from inside the button branch: clicking a
    # download button reruns the script with the "Analyze Text" button False,
    # which would otherwise make the results disappear.
    if st.session_state.pdf_summary is not None:
        st.subheader("Analysis Results")
        st.text(json_to_text(st.session_state.pdf_summary))
        st.download_button(
            label="Download PDF Report",
            data=st.session_state.pdf_report,
            file_name="analysis_report.pdf",
            mime="application/pdf",
        )
        st.download_button(
            label="Download Word Report",
            data=st.session_state.word_report,
            file_name="analysis_report.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

st.markdown('\n', unsafe_allow_html=True)

# Only offer chat once an index exists; the key itself is always present.
if st.session_state.vectorstore is not None:
    st.subheader("Chat with the Document")
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Named `question` so it does not rebind the module-level `prompt` template.
    if question := st.chat_input("Ask a question about the document"):
        st.session_state.messages.append({"role": "user", "content": question})
        with st.chat_message("user"):
            st.markdown(question)

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                docs = st.session_state.vectorstore.similarity_search(question, k=3)
                context = "\n".join([doc.page_content for doc in docs])
                messages = [
                    SystemMessage(content="You are a assistant. Answer the question based on the provided document context."),
                    HumanMessage(content=f"Context: {context}\n\nQuestion: {question}"),
                ]
                response = llm.invoke(messages)
                st.markdown(response.content)
        st.session_state.messages.append(
            {"role": "assistant", "content": response.content}
        )

# NOTE(review): this markup is empty in the reviewed source, and
# st.session_state.analysis_time is stored but never displayed - presumably a
# footer was stripped; confirm the intended content.
st.markdown(f'', unsafe_allow_html=True)