Update app.py

app.py (CHANGED)
```diff
@@ -2,41 +2,31 @@ import spaces
 import os
 import gradio as gr
 import torch
-…
 from transformers import AutoTokenizer, TextStreamer, pipeline, AutoModelForCausalLM
-from …
-from …
+from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+from langchain_community.vectorstores import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
-from …
+from langchain_community.llms import HuggingFacePipeline

 # System prompts
 DEFAULT_SYSTEM_PROMPT = """
-…
-… respond with "I don't …
+You are a ROS2 expert assistant. Based on the context provided, give direct and concise answers.
+If the information is not in the context, respond with "I don't find that information in the available documentation."
+Keep responses to 1-2 lines maximum.
 """.strip()

-…
-…
-def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
+def generate_prompt(context: str, question: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
     return f"""
 [INST] <<SYS>>
 {system_prompt}
 <</SYS>>
-{…
-""".strip()
-
-template = generate_prompt(
-"""
-{context}
+Context: {context}
 Question: {question}
-…
-…
-)
-
-prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])
+Answer: [/INST]
+""".strip()

-# Initialize embeddings and database
+# Initialize embeddings and database
 embeddings = HuggingFaceInstructEmbeddings(
     model_name="hkunlp/instructor-base",
     model_kwargs={"device": "cpu"}
```
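The hunk above ends at the embeddings setup, but the Chroma store that later code queries as `db` is built in lines outside the visible hunks. A minimal sketch of what that construction typically looks like with the imports added here; the persist directory name is an assumption, not something shown in the diff:

```python
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": "cpu"},
)

# Load a persisted Chroma index; "db" as the directory name is assumed.
db = Chroma(
    persist_directory="db",
    embedding_function=embeddings,
)
```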
```diff
@@ -55,21 +45,37 @@ def initialize_model():
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         token=token,
-        device_map="cuda"
+        device_map="cuda" if torch.cuda.is_available() else "cpu"
     )
-    # if torch.cuda.is_available():
-    #     model.device = "cuda"
-    # else:
-    #     print("CUDA is not available")

     return model, tokenizer

+class CustomTextStreamer(TextStreamer):
+    def __init__(self, tokenizer, skip_prompt=True, skip_special_tokens=True):
+        super().__init__(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)
+        self.output_text = ""
+
+    def put(self, value):
+        self.output_text += value
+        super().put(value)
+
 @spaces.GPU
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     try:
         model, tokenizer = initialize_model()

-        …
+        # Get relevant context from the database
+        retriever = db.as_retriever(search_kwargs={"k": 2})
+        docs = retriever.get_relevant_documents(message)
+        context = "\n".join([doc.page_content for doc in docs])
+
+        # Generate the complete prompt
+        prompt = generate_prompt(context=context, question=message, system_prompt=system_message)
+
+        # Set up the streamer
+        streamer = CustomTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+        # Set up the pipeline
         text_pipeline = pipeline(
             "text-generation",
             model=model,
```
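For reference, the reworked generate_prompt() renders a Llama-2 style instruction block. An illustrative call (the context and question strings are made up for the example):

```python
print(generate_prompt(
    context="Use `ros2 topic list` to see active topics.",
    question="How do I list active topics?",
))
# Output shape (whitespace approximate):
# [INST] <<SYS>>
# You are a ROS2 expert assistant. Based on the context provided, give direct and concise answers.
# If the information is not in the context, respond with "I don't find that information in the available documentation."
# Keep responses to 1-2 lines maximum.
# <</SYS>>
# Context: Use `ros2 topic list` to see active topics.
# Question: How do I list active topics?
# Answer: [/INST]
```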
```diff
@@ -81,18 +87,11 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
             streamer=streamer,
         )

-        …
-        …
-        qa_chain = RetrievalQA.from_chain_type(
-            llm=llm,
-            chain_type="stuff",
-            retriever=db.as_retriever(search_kwargs={"k": 2}),
-            return_source_documents=False,
-            chain_type_kwargs={"prompt": prompt_template}
-        )
+        # Generate response
+        _ = text_pipeline(prompt, max_new_tokens=max_tokens)

-        …
-        yield …
+        # Return only the generated response
+        yield streamer.output_text.strip()

     except Exception as e:
         yield f"An error occurred: {str(e)}"
```
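A note on how the reply text is captured: transformers' TextStreamer.put() is fed token-id tensors during generation, while the decoded text is delivered through on_finalized_text(). If the put() override above receives tensors rather than strings, the += concatenation would fail, so a capture subclass is more commonly written against on_finalized_text(). A sketch under that assumption (the class name is illustrative):

```python
from transformers import TextStreamer

class CapturingTextStreamer(TextStreamer):
    """Streams tokens to stdout while also collecting the decoded text."""

    def __init__(self, tokenizer, skip_prompt=True, **decode_kwargs):
        super().__init__(tokenizer, skip_prompt=skip_prompt, **decode_kwargs)
        self.output_text = ""

    def on_finalized_text(self, text: str, stream_end: bool = False):
        # on_finalized_text receives already-decoded strings.
        self.output_text += text
        super().on_finalized_text(text, stream_end=stream_end)
```

An alternative that avoids a custom streamer entirely is to call the pipeline with return_full_text=False and read result[0]["generated_text"].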
```diff
@@ -134,4 +133,4 @@ demo = gr.ChatInterface(
 )

 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
```
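The gr.ChatInterface block named in the last hunk header is not part of the diff; based on respond()'s signature, its wiring is presumably along these lines, with labels and default values being assumptions:

```python
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)
```

ChatInterface passes (message, history) first and then the additional inputs in order, which matches the (system_message, max_tokens, temperature, top_p) parameters of respond().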
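For context, only the tail of initialize_model() appears in the second hunk. A plausible shape consistent with the visible lines is sketched below; the model id and the way the token is obtained are assumptions, since the real values sit outside the hunks:

```python
def initialize_model():
    model_id = "meta-llama/Llama-2-7b-chat-hf"   # assumed; not shown in the diff
    token = os.environ.get("HF_TOKEN")           # assumed; not shown in the diff

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=token,
        device_map="cuda" if torch.cuda.is_available() else "cpu"
    )

    return model, tokenizer
```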
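One follow-up on the retrieval call added in the second hunk: recent LangChain releases deprecate Retriever.get_relevant_documents() in favor of the runnable interface, so the same lookup can also be written as:

```python
# Equivalent lookup via the runnable API (newer langchain-core versions).
docs = retriever.invoke(message)
context = "\n".join(doc.page_content for doc in docs)
```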