import streamlit as st from langchain_google_genai import ChatGoogleGenerativeAI from langchain_core.messages import SystemMessage, HumanMessage from langchain_core.output_parsers import PydanticOutputParser from langchain_core.prompts import PromptTemplate from langchain.chains import LLMChain from pydantic import BaseModel, Field from typing import List from dotenv import load_dotenv import os import time from datetime import datetime import PyPDF2 from fpdf import FPDF from docx import Document import io from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings from langchain_community.vectorstores import FAISS from langchain_text_splitters import RecursiveCharacterTextSplitter load_dotenv() api_key = os.getenv("GOOGLE_API_KEY") llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=api_key) class KeyPoint(BaseModel): point: str = Field(description="A key point extracted from the document.") class Summary(BaseModel): summary: str = Field(description="A brief summary of the document content.") class DocumentAnalysis(BaseModel): key_points: List[KeyPoint] = Field(description="List of key points from the document.") summary: Summary = Field(description="Summary of the document.") parser = PydanticOutputParser(pydantic_object=DocumentAnalysis) prompt_template = """ Analyze the following text and extract key points and a summary. {format_instructions} Text: {text} """ prompt = PromptTemplate( template=prompt_template, input_variables=["text"], partial_variables={"format_instructions": parser.get_format_instructions()} ) chain = LLMChain(llm=llm, prompt=prompt, output_parser=parser) def analyze_text_structured(text): output = chain.run(text=text) return output def extract_text_from_pdf(pdf_file): pdf_reader = PyPDF2.PdfReader(pdf_file) text = "" for page in pdf_reader.pages: text += page.extract_text() return text def json_to_text(analysis): text_output = "=== Summary ===\n" + f"{analysis.summary.summary}\n\n" text_output += "=== Key Points ===\n" for i, key_point in enumerate(analysis.key_points, start=1): text_output += f"{i}. {key_point.point}\n" return text_output def create_pdf_report(analysis): pdf = FPDF() pdf.add_page() pdf.set_font('Helvetica', '', 12) pdf.cell(200, 10, txt="PDF Analysis Report", ln=True, align='C') pdf.cell(200, 10, txt=f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=True, align='C') clean_text = json_to_text(analysis) pdf.multi_cell(0, 10, txt=clean_text) return pdf.output(dest='S') def create_word_report(analysis): doc = Document() doc.add_heading('PDF Analysis Report', 0) doc.add_paragraph(f'Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') clean_text = json_to_text(analysis) doc.add_heading('Analysis', level=1) doc.add_paragraph(clean_text) docx_bytes = io.BytesIO() doc.save(docx_bytes) docx_bytes.seek(0) return docx_bytes.getvalue() st.set_page_config(page_title="Chat With PDF", page_icon="😒") def local_css(): st.markdown(""" """, unsafe_allow_html=True) local_css() if "current_file" not in st.session_state: st.session_state.current_file = None if "pdf_summary" not in st.session_state: st.session_state.pdf_summary = None if "analysis_time" not in st.session_state: st.session_state.analysis_time = 0 if "pdf_report" not in st.session_state: st.session_state.pdf_report = None if "word_report" not in st.session_state: st.session_state.word_report = None if "vectorstore" not in st.session_state: st.session_state.vectorstore = None if "messages" not in st.session_state: st.session_state.messages = [] st.markdown('