Spaces:

tdurzynski
/

chat-with-your-data

Sleeping

App Files Files Community

tdurzynski committed on Feb 5

Commit

9ca2091

verified ·

1 Parent(s): 6f98b16

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -131

app.py CHANGED Viewed

@@ -1,19 +1,17 @@
 import os
 import logging
 import gradio as gr
 from dotenv import load_dotenv
-from langchain.document_loaders import ArxivLoader, PyPDFLoader
 from langchain.text_splitter import TokenTextSplitter
 from langchain.vectorstores import Chroma
 from langchain_community.embeddings import HuggingFaceHubEmbeddings
-from langchain.chains import RetrievalQA
-from langchain.chains.summarize import load_summarize_chain
 from langchain_groq import ChatGroq
-from transformers import pipeline
 from PyPDF2 import PdfReader
 from huggingface_hub import login
 from groq import AsyncGroq, Groq
-import asyncio
 # Load environment variables
 load_dotenv()
@@ -34,92 +32,17 @@ login(HUGGING_API_KEY)
 # Load models and embeddings
 embedding_model = HuggingFaceHubEmbeddings(huggingfacehub_api_token=HUGGING_API_KEY)
 llm = ChatGroq(temperature=0, model_name="llama3-70b-8192", api_key=GROQ_API_KEY)
-def display_results(result):
-    """Format and display results properly."""
-    return "\n".join(result)
-def summarize_text(text):
-    """Summarize text using the Groq API."""
-    try:
-        sum_client = Groq(api_key=GROQ_API_KEY)
-        messages = [
-            {"role": "system", "content": "You are an excellent analyst who excels in summarization task. If I give you the whole text, you should summarize it."},
-            {"role": "user", "content": f"Summarize the paper: {text}"}
-        ]
-        response = sum_client.chat.completions.create(
-            messages=messages,
-            model="llama3-70b-8192",
-            temperature=0,
-            max_tokens=8192,
-            top_p=1,
-        )
-        return response.choices[0].message.content
-    except Exception as e:
-        logger.error(f"Error summarizing text: {e}")
-        return "Error in summarization."
-def summarize_pdf(pdf_file_path, max_length):
-    """Extract text from a PDF and summarize it."""
-    try:
-        reader = PdfReader(pdf_file_path)
-        text = "\n".join(page.extract_text() or "" for page in reader.pages)
-        text_splitter = TokenTextSplitter(chunk_size=8192, chunk_overlap=1000)
-        chunks = text_splitter.split_text(text)
-        summary = ""
-        for chunk in chunks:
-            summary += summarize_text(chunk)
-        return summary
-    except Exception as e:
-        logger.error(f"Error summarizing PDF: {e}")
-        return "Failed to process the PDF."
-def summarize_arxiv_pdf(query):
-    """Summarize an arXiv paper given a query."""
-    try:
-        loader = ArxivLoader(query=query, load_max_docs=10)
-        documents = loader.load()
-        text_splitter = TokenTextSplitter(chunk_size=5700, chunk_overlap=100)
-        chunks = text_splitter.split_documents(documents)
-        ref_summary = ""
-        for chunk in chunks:
-            ref_summary += summarize_text(chunk.page_content)
-        arxiv_summary = loader.get_summaries_as_docs()
-        summaries = []
-        for doc in arxiv_summary:
-            title = doc.metadata.get("Title", "Unknown Title")
-            authors = doc.metadata.get("Authors", "Unknown Authors")
-            url = doc.metadata.get("Entry ID", "No URL")
-            summaries.append(f"**{title}**\n")
-            summaries.append(f"**Authors:** {authors}\n")
-            summaries.append(f"**View full paper:** [Link to paper]({url})\n")
-            summaries.append(f"**Summary:** {doc.page_content}\n")
-            summaries.append(f"**Enhanced Summary:**\n {ref_summary}")
-        return display_results(summaries)
-    except Exception as e:
-        logger.error(f"Error summarizing arXiv paper: {e}")
-        return "Failed to process arXiv paper."
 client = AsyncGroq(api_key=GROQ_API_KEY)
 async def chat_with_replit(message, history):
-    """Chat functionality using Groq API."""
     try:
         messages = [{"role": "system", "content": "You are an assistant answering user questions."}]
-        for chat in history:
             user_msg, assistant_msg = chat
             messages.append({"role": "user", "content": user_msg})
             messages.append({"role": "assistant", "content": assistant_msg})
@@ -132,7 +55,7 @@ async def chat_with_replit(message, history):
             temperature=0,
             max_tokens=1024,
             top_p=1,
-            stream=False,  # Using non-streaming for simplicity in this integration.
         )
         return response.choices[0].message.content
@@ -140,13 +63,24 @@ async def chat_with_replit(message, history):
         logger.error(f"Chat error: {e}")
         return "Error in chat response."
-async def chat_with_replit_pdf(message, history, doi_num):
-    """Chat with arXiv papers using document retrieval."""
     try:
         loader = ArxivLoader(query=str(doi_num), load_max_docs=10)
         documents = loader.load_and_split()
         metadata = documents[0].metadata
         vector_store = Chroma.from_documents(documents, embedding_model)
         def retrieve_relevant_content(user_query):
@@ -173,63 +107,145 @@ async def chat_with_replit_pdf(message, history, doi_num):
         return response.choices[0].message.content
     except Exception as e:
-        logger.error(f"Error in chat with PDF: {e}")
-        return "Error processing chat with PDF."
-# Define a synchronous wrapper for the async chat function
-def chat_with_replit_sync(message, history):
-    return asyncio.run(chat_with_replit(message, history))
-# Gradio UI
 with gr.Blocks() as app:
-    # Tab for Local PDF Summarization
-    with gr.Tab(label="Local PDF Summarization"):
-        with gr.Row():
-            input_pdf = gr.File(label="Upload PDF file")
-            max_length_slider = gr.Slider(512, 4096, value=2048, step=512, label="Max Length")
-            summarize_pdf_btn = gr.Button(value="Summarize PDF")
-        with gr.Row():
-            output_pdf_summary = gr.Markdown(label="Summary", height=1000)
-        summarize_pdf_btn.click(summarize_pdf, inputs=[input_pdf, max_length_slider], outputs=output_pdf_summary)
-    # Tab for Arxiv Summarization
-    with gr.Tab(label="Arxiv Summarization"):
-        with gr.Column():
-            arxiv_number = gr.Textbox(label="Enter arXiv number, i.e 2502.02523")
-            summarize_btn = gr.Button(value="Summarize arXiv Paper")
-        with gr.Column():
-            output_summary = gr.Markdown(label="Summary", height=1000)
-        summarize_btn.click(summarize_arxiv_pdf, inputs=arxiv_number, outputs=output_summary)
-    # New Tab for Chat functionality
-    with gr.Tab(label="Chat with Assistant"):
         gr.Markdown("### Chat with the Assistant")
         with gr.Row():
-            chat_input = gr.Textbox(placeholder="Type your message here...", label="Your Message")
-            send_button = gr.Button("Send")
-        # A Markdown to display the conversation history (or you could use gr.Chatbot)
-        chat_output = gr.Markdown(label="Chat Output", height=300)
-        # Maintain chat history as a list of [user, assistant] pairs
-        chat_history = gr.State([])
-        # When the send button is clicked, update the chat history and get a response.
-        def update_chat(user_message, history):
-            # Append the new user message to history with an empty assistant response for now.
             history = history or []
             history.append([user_message, ""])
             return history, history
-        def update_assistant_response(history):
-            # Get the last user message and call the chat function
             user_message = history[-1][0]
             response = chat_with_replit_sync(user_message, history[:-1])
-            # Update the last entry with the assistant's response
             history[-1][1] = response
-            # Format the conversation for display
             formatted = "\n\n".join([f"**User:** {u}\n\n**Assistant:** {a}" for u, a in history])
             return history, formatted
-        send_button.click(update_chat, inputs=[chat_input, chat_history], outputs=[chat_history, chat_output])
-        send_button.click(update_assistant_response, inputs=chat_history, outputs=[chat_history, chat_output])
 app.launch()

 import os
 import logging
 import gradio as gr
+import asyncio
 from dotenv import load_dotenv
+from langchain.document_loaders import ArxivLoader
 from langchain.text_splitter import TokenTextSplitter
 from langchain.vectorstores import Chroma
 from langchain_community.embeddings import HuggingFaceHubEmbeddings
 from langchain_groq import ChatGroq
 from PyPDF2 import PdfReader
 from huggingface_hub import login
 from groq import AsyncGroq, Groq
+from langchain.docstore.document import Document  # For creating a document from PDF text
 # Load environment variables
 load_dotenv()
 # Load models and embeddings
 embedding_model = HuggingFaceHubEmbeddings(huggingfacehub_api_token=HUGGING_API_KEY)
 llm = ChatGroq(temperature=0, model_name="llama3-70b-8192", api_key=GROQ_API_KEY)
 client = AsyncGroq(api_key=GROQ_API_KEY)
+# -----------------------------
+# Chat Functionality (General)
+# -----------------------------
 async def chat_with_replit(message, history):
+    """General chat functionality using the Groq API."""
     try:
         messages = [{"role": "system", "content": "You are an assistant answering user questions."}]
+        for chat in history or []:
             user_msg, assistant_msg = chat
             messages.append({"role": "user", "content": user_msg})
             messages.append({"role": "assistant", "content": assistant_msg})
             temperature=0,
             max_tokens=1024,
             top_p=1,
+            stream=False,  # For simplicity we are not streaming
         )
         return response.choices[0].message.content
         logger.error(f"Chat error: {e}")
         return "Error in chat response."
+def chat_with_replit_sync(message, history):
+    """Blocking entry point for general chat: runs chat_with_replit to completion."""
+    pending_reply = chat_with_replit(message, history)
+    return asyncio.run(pending_reply)
+# -------------------------------------------------
+# Chat Functionality for ArXiv Paper (Document Chat)
+# -------------------------------------------------
+async def chat_with_replit_arxiv(message, history, doi_num):
+    """Chat answering questions using an ArXiv paper as context."""
     try:
+        # Load the ArXiv document and split it into chunks
         loader = ArxivLoader(query=str(doi_num), load_max_docs=10)
         documents = loader.load_and_split()
+        if not documents:
+            return "No documents found for the provided arXiv number."
         metadata = documents[0].metadata
+        # Create vector store for the loaded documents
         vector_store = Chroma.from_documents(documents, embedding_model)
         def retrieve_relevant_content(user_query):
         return response.choices[0].message.content
     except Exception as e:
+        logger.error(f"Error in chat with arXiv PDF: {e}")
+        return "Error processing chat with arXiv paper."
+def chat_with_replit_arxiv_sync(message, history, doi_num):
+    """Blocking entry point for arXiv chat: runs chat_with_replit_arxiv to completion."""
+    pending_reply = chat_with_replit_arxiv(message, history, doi_num)
+    return asyncio.run(pending_reply)
+# -------------------------------------------------
+# Chat Functionality for Local PDF (Document Chat)
+# -------------------------------------------------
+async def chat_with_replit_local_pdf(message, history, pdf_file_path):
+    """Answer a question using passages retrieved from a local PDF.
+
+    Args:
+        message: The user's question.
+        history: Earlier [user, assistant] turns. Accepted for signature parity
+            with the other chat functions; it is not sent to the model here.
+        pdf_file_path: Path (or file object) of the uploaded PDF, or None when
+            nothing has been uploaded yet.
+
+    Returns:
+        The model's reply, or a short human-readable error string on failure.
+    """
+    # Without an upload there is nothing to read; say so instead of failing inside PdfReader.
+    if not pdf_file_path:
+        return "Please upload a PDF file first."
+    try:
+        # Extract text from the uploaded PDF file
+        reader = PdfReader(pdf_file_path)
+        text = "\n".join(page.extract_text() or "" for page in reader.pages)
+        if not text.strip():
+            return "Could not extract text from PDF."
+        # Metadata values must be plain strings; a file object exposes its path as .name.
+        source = str(getattr(pdf_file_path, "name", pdf_file_path))
+        # Split the text into chunks before indexing. Indexing the whole PDF as a single
+        # document would make the k=3 search below return the entire text every time.
+        text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=100)
+        documents = [
+            Document(page_content=chunk, metadata={"source": source})
+            for chunk in text_splitter.split_text(text)
+        ]
+        # NOTE(review): the vector store is rebuilt from the PDF on every question, and
+        # depending on the Chroma version the default in-memory collection may be shared
+        # within the process - confirm, and consider caching one store per uploaded file.
+        vector_store = Chroma.from_documents(documents, embedding_model)
+        results = vector_store.similarity_search(message, k=3)
+        relevant_content = "\n\n".join(doc.page_content for doc in results)
+        # The system instruction goes first so it frames the user's question.
+        messages = [
+            {"role": "system", "content": f"Answer based on this PDF document: {source}.\n"
+                                          f"Relevant Content: {relevant_content}"},
+            {"role": "user", "content": message},
+        ]
+        response = await client.chat.completions.create(
+            messages=messages,
+            model="llama3-70b-8192",
+            temperature=0,
+            max_tokens=1024,
+            top_p=1,
+            stream=False,
+        )
+        return response.choices[0].message.content
+    except Exception as e:
+        # UI boundary: log the failure and return a readable message instead of raising.
+        logger.error(f"Error in chat with local PDF: {e}")
+        return "Error processing chat with local PDF."
+def chat_with_replit_local_pdf_sync(message, history, pdf_file):
+    """Blocking entry point for local-PDF chat: runs chat_with_replit_local_pdf to completion."""
+    pending_reply = chat_with_replit_local_pdf(message, history, pdf_file)
+    return asyncio.run(pending_reply)
+# ------------------------------------
+# Gradio UI Integration
+# ------------------------------------
 with gr.Blocks() as app:
+    # --- Tab: General Chat ---
+    with gr.Tab(label="General Chat"):
         gr.Markdown("### Chat with the Assistant")
         with gr.Row():
+            general_chat_input = gr.Textbox(placeholder="Type your message here...", label="Your Message")
+            general_send_button = gr.Button("Send")
+        general_chat_output = gr.Markdown(label="Chat Output", height=300)
+        general_chat_history = gr.State([])
+        def update_general_chat(user_message, history):
+            # Append the new message with an empty assistant reply for now.
             history = history or []
             history.append([user_message, ""])
             return history, history
+        def update_general_response(history):
             user_message = history[-1][0]
             response = chat_with_replit_sync(user_message, history[:-1])
             history[-1][1] = response
             formatted = "\n\n".join([f"**User:** {u}\n\n**Assistant:** {a}" for u, a in history])
             return history, formatted
+        general_send_button.click(update_general_chat, inputs=[general_chat_input, general_chat_history],
+                                    outputs=[general_chat_history, general_chat_output])
+        general_send_button.click(update_general_response, inputs=general_chat_history,
+                                    outputs=[general_chat_history, general_chat_output])
+    # --- Tab: Chat with ArXiv Paper ---
+    with gr.Tab(label="Chat with ArXiv Paper"):
+        gr.Markdown("### Ask Questions About an ArXiv Paper")
+        with gr.Row():
+            arxiv_input = gr.Textbox(placeholder="Enter your question here...", label="Your Question")
+            arxiv_doi = gr.Textbox(placeholder="Enter arXiv number, e.g. 2502.02523", label="ArXiv Number")
+            arxiv_send_button = gr.Button("Send")
+        arxiv_chat_output = gr.Markdown(label="Chat Output", height=300)
+        arxiv_chat_history = gr.State([])
+        def update_arxiv_chat(user_message, history):
+            history = history or []
+            history.append([user_message, ""])
+            return history, history
+        def update_arxiv_response(history, doi_num):
+            user_message = history[-1][0]
+            response = chat_with_replit_arxiv_sync(user_message, history[:-1], doi_num)
+            history[-1][1] = response
+            formatted = "\n\n".join([f"**User:** {u}\n\n**Assistant:** {a}" for u, a in history])
+            return history, formatted
+        arxiv_send_button.click(update_arxiv_chat, inputs=[arxiv_input, arxiv_chat_history],
+                                  outputs=[arxiv_chat_history, arxiv_chat_output])
+        arxiv_send_button.click(update_arxiv_response, inputs=[arxiv_chat_history, arxiv_doi],
+                                  outputs=[arxiv_chat_history, arxiv_chat_output])
+    # --- Tab: Chat with Local PDF ---
+    with gr.Tab(label="Chat with Local PDF"):
+        gr.Markdown("### Ask Questions About an Uploaded PDF")
+        with gr.Row():
+            pdf_file_input = gr.File(label="Upload PDF file")
+            pdf_chat_input = gr.Textbox(placeholder="Enter your question here...", label="Your Question")
+            pdf_send_button = gr.Button("Send")
+        pdf_chat_output = gr.Markdown(label="Chat Output", height=300)
+        pdf_chat_history = gr.State([])
+        def update_pdf_chat(user_message, history):
+            history = history or []
+            history.append([user_message, ""])
+            return history, history
+        def update_pdf_response(history, pdf_file):
+            user_message = history[-1][0]
+            response = chat_with_replit_local_pdf_sync(user_message, history[:-1], pdf_file)
+            history[-1][1] = response
+            formatted = "\n\n".join([f"**User:** {u}\n\n**Assistant:** {a}" for u, a in history])
+            return history, formatted
+        pdf_send_button.click(update_pdf_chat, inputs=[pdf_chat_input, pdf_chat_history],
+                                outputs=[pdf_chat_history, pdf_chat_output])
+        pdf_send_button.click(update_pdf_response, inputs=[pdf_chat_history, pdf_file_input],
+                                outputs=[pdf_chat_history, pdf_chat_output])
 app.launch()