Spaces:

DrishtiSharma
/

chat-w-google-patents

Running

App Files Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

e90d440

verified ·

1 Parent(s): fc8a155

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -83

app.py CHANGED Viewed

@@ -2,22 +2,10 @@ import sys
 import os
 import re
 import time
 import streamlit as st
 import nltk
-from io import BytesIO
-# Force NLTK to download 'punkt' into a virtual, in-memory location
-try:
-    from nltk.data import load
-    print("Downloading 'punkt' tokenizer to memory...")
-    nltk.download("punkt")
-    load("tokenizers/punkt/english.pickle")
-    print("✅ 'punkt' successfully loaded into memory.")
-except Exception as e:
-    print(f"Error loading 'punkt': {e}")
-    raise e
-sys.path.append(os.path.abspath("."))
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 from langchain.llms import OpenAI
@@ -27,54 +15,52 @@ from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import NLTKTextSplitter
 from patent_downloader import PatentDownloader
-PERSISTED_DIRECTORY = os.path.join(os.getcwd(), "chroma_db")
-# Fetch API key securely from the environment
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
-    st.error("Critical Error: OpenAI API key not found in the environment variables. Please configure it.")
     st.stop()
 def load_docs(document_path):
     try:
-        loader = UnstructuredPDFLoader(
-            document_path,
-            mode="elements",
-            strategy="fast",
-            ocr_languages=None
-        )
         documents = loader.load()
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         return text_splitter.split_documents(documents)
     except Exception as e:
-        st.error(f"Failed to load and process PDF: {e}")
-        st.stop()
-def already_indexed(vectordb, file_name):
-    indexed_sources = set(
-        x["source"] for x in vectordb.get(include=["metadatas"])["metadatas"]
-    )
-    return file_name in indexed_sources
-def load_chain(file_name=None):
-    loaded_patent = st.session_state.get("LOADED_PATENT")
-    vectordb = Chroma(
-        persist_directory=PERSISTED_DIRECTORY,
-        embedding_function=HuggingFaceEmbeddings(),
     )
-    if loaded_patent == file_name or already_indexed(vectordb, file_name):
-        st.write("✅ Already indexed.")
-    else:
-        vectordb.delete_collection()
-        docs = load_docs(file_name)
-        st.write("🔍 Number of Documents: ", len(docs))
-        vectordb = Chroma.from_documents(
-            docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
-        )
-        vectordb.persist()
-        st.session_state["LOADED_PATENT"] = file_name
     memory = ConversationBufferMemory(
         memory_key="chat_history",
@@ -82,6 +68,7 @@ def load_chain(file_name=None):
         input_key="question",
         output_key="answer",
     )
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
         vectordb.as_retriever(search_kwargs={"k": 3}),
@@ -89,20 +76,8 @@ def load_chain(file_name=None):
         memory=memory,
     )
-def extract_patent_number(url):
-    pattern = r"/patent/([A-Z]{2}\d+)"
-    match = re.search(pattern, url)
-    return match.group(1) if match else None
-def download_pdf(patent_number):
-    try:
-        patent_downloader = PatentDownloader(verbose=True)
-        output_path = patent_downloader.download(patents=patent_number, output_path="/tmp")
-        return output_path[0]
-    except Exception as e:
-        st.error(f"Failed to download patent PDF: {e}")
-        st.stop()
 if __name__ == "__main__":
     st.set_page_config(
         page_title="Patent Chat: Google Patents Chat Demo",
@@ -110,8 +85,10 @@ if __name__ == "__main__":
         layout="wide",
         initial_sidebar_state="expanded",
     )
     st.header("📖 Patent Chat: Google Patents Chat Demo")
     patent_link = st.text_input("Enter Google Patent Link:", key="PATENT_LINK")
     if not patent_link:
@@ -123,48 +100,64 @@ if __name__ == "__main__":
         st.error("Invalid patent link format. Please provide a valid Google patent link.")
         st.stop()
-    st.write(f"Patent number: **{patent_number}**")
-    pdf_path = os.path.join("/tmp", f"{patent_number}.pdf")
-    if os.path.isfile(pdf_path):
-        st.write("✅ File already downloaded.")
-    else:
-        st.write("📥 Downloading patent file...")
         pdf_path = download_pdf(patent_number)
-        st.write(f"✅ File downloaded: {pdf_path}")
-    st.write("🔄 Loading document into the system...")
-    chain = load_chain(pdf_path)
     st.success("🚀 Document successfully loaded! You can now start asking questions.")
     if "messages" not in st.session_state:
         st.session_state["messages"] = [
             {"role": "assistant", "content": "Hello! How can I assist you with this patent?"}
         ]
     for message in st.session_state.messages:
         with st.chat_message(message["role"]):
             st.markdown(message["content"])
     if user_input := st.chat_input("What is your question?"):
         st.session_state.messages.append({"role": "user", "content": user_input})
         with st.chat_message("user"):
             st.markdown(user_input)
         with st.chat_message("assistant"):
             message_placeholder = st.empty()
-            full_response = ""
-        with st.spinner("Generating response..."):
-            try:
-                assistant_response = chain({"question": user_input})
-                for chunk in assistant_response["answer"].split():
-                    full_response += chunk + " "
-                    time.sleep(0.05)
-                    message_placeholder.markdown(full_response + "▌")
-            except Exception as e:
-                full_response = f"An error occurred: {e}"
-            finally:
-                message_placeholder.markdown(full_response)
         st.session_state.messages.append({"role": "assistant", "content": full_response})

 import os
 import re
 import time
+import tempfile
 import streamlit as st
 import nltk
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 from langchain.llms import OpenAI
 from langchain.text_splitter import NLTKTextSplitter
 from patent_downloader import PatentDownloader
+# Download NLTK resources
+nltk.download("punkt", quiet=True)
+#fetch API key
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 if not OPENAI_API_KEY:
+    st.error("Critical Error: OpenAI API key not found in environment variables. Please configure it.")
     st.stop()
def extract_patent_number(url):
    """Extract the patent number from a Google Patents link.

    The link is searched for a ``/patent/<ID>`` segment, where ``<ID>`` is
    two uppercase letters followed by one or more digits (for example
    ``US4405829``). Anything after the digits, such as a trailing ``A`` or
    ``B2``, is not part of the returned value.

    Args:
        url: Link text to inspect. ``None`` and other non-string values are
            treated as "no patent number found" rather than raising
            ``TypeError`` from ``re.search``.

    Returns:
        The matched patent number as a string, or ``None`` when the input
        does not contain one.
    """
    # Keep the "None on failure" contract for inputs re.search cannot handle.
    if not isinstance(url, str):
        return None
    pattern = r"/patent/([A-Z]{2}\d+)"
    match = re.search(pattern, url)
    return match.group(1) if match else None
def download_pdf(patent_number):
    """Download a patent PDF and return the path of the saved file.

    The PDF is written into a fresh directory created with
    ``tempfile.mkdtemp``. A ``tempfile.TemporaryDirectory`` context manager
    must not be used here: it deletes the directory (and the PDF inside it)
    as soon as the ``with`` block exits, so the path handed back to the
    caller would no longer exist.

    The directory is left in place on success; the caller owns it and may
    remove it once the PDF has been processed. On failure the directory is
    removed again before returning.

    Args:
        patent_number: Identifier passed as ``patents=`` to
            ``PatentDownloader.download``.

    Returns:
        The first element of the value returned by
        ``PatentDownloader.download``, or ``None`` if anything went wrong
        (the error is reported through ``st.error``).
    """
    import shutil  # local import: only needed to clean up after a failure

    download_dir = None
    try:
        download_dir = tempfile.mkdtemp(prefix="patent_")
        patent_downloader = PatentDownloader(verbose=True)
        output_path = patent_downloader.download(
            patents=patent_number, output_path=download_dir
        )
        return output_path[0]
    except Exception as e:
        # Nothing usable was produced; do not leave the directory behind.
        if download_dir is not None:
            shutil.rmtree(download_dir, ignore_errors=True)
        st.error(f"Failed to download patent PDF: {e}")
        return None
 def load_docs(document_path):
+    """Loads and splits PDF documents into chunks."""
     try:
+        loader = UnstructuredPDFLoader(document_path)
         documents = loader.load()
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         return text_splitter.split_documents(documents)
     except Exception as e:
+        st.error(f"Failed to process PDF: {e}")
+        return []
+def load_chain(docs):
+    """Creates a conversational retrieval chain using in-memory ChromaDB."""
+    vectordb = Chroma.from_documents(
+        docs, HuggingFaceEmbeddings(), persist_directory=None
     )
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         input_key="question",
         output_key="answer",
     )
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
         vectordb.as_retriever(search_kwargs={"k": 3}),
         memory=memory,
     )
+# Streamlit UI
 if __name__ == "__main__":
     st.set_page_config(
         page_title="Patent Chat: Google Patents Chat Demo",
         layout="wide",
         initial_sidebar_state="expanded",
     )
     st.header("📖 Patent Chat: Google Patents Chat Demo")
+    # Input for Google Patent Link
     patent_link = st.text_input("Enter Google Patent Link:", key="PATENT_LINK")
     if not patent_link:
         st.error("Invalid patent link format. Please provide a valid Google patent link.")
         st.stop()
+    st.write(f"🔍 Patent Number: **{patent_number}**")
+    # Download or Upload PDF
+    st.write("📥 Downloading patent PDF...")
+    pdf_path = None
+    try:
         pdf_path = download_pdf(patent_number)
+    except Exception:
+        st.error("Automatic download failed. Please upload the PDF manually below.")
+    if not pdf_path:
+        uploaded_file = st.file_uploader("Upload the patent PDF file:", type="pdf")
+        if uploaded_file:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                tmp_file.write(uploaded_file.read())
+                pdf_path = tmp_file.name
+            st.success("✅ PDF successfully uploaded.")
+        else:
+            st.stop()
+    # Load and Process PDF
+    st.write("🔄 Processing document...")
+    docs = load_docs(pdf_path)
+    if not docs:
+        st.error("No content found in the PDF. Exiting...")
+        st.stop()
+    chain = load_chain(docs)
     st.success("🚀 Document successfully loaded! You can now start asking questions.")
+    # Initialize chat history
     if "messages" not in st.session_state:
         st.session_state["messages"] = [
             {"role": "assistant", "content": "Hello! How can I assist you with this patent?"}
         ]
+    # Display chat history
     for message in st.session_state.messages:
         with st.chat_message(message["role"]):
             st.markdown(message["content"])
+    # Handle User Input
     if user_input := st.chat_input("What is your question?"):
         st.session_state.messages.append({"role": "user", "content": user_input})
         with st.chat_message("user"):
             st.markdown(user_input)
         with st.chat_message("assistant"):
             message_placeholder = st.empty()
+            with st.spinner("Generating response..."):
+                try:
+                    assistant_response = chain({"question": user_input})
+                    full_response = assistant_response.get("answer", "I'm sorry, I couldn't generate a response.")
+                except Exception as e:
+                    full_response = f"An error occurred: {e}"
+            message_placeholder.markdown(full_response)
         st.session_state.messages.append({"role": "assistant", "content": full_response})