Update app.py
app.py
CHANGED
@@ -32,7 +32,7 @@ embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-Mi
 # Initialize Pinecone connection
 try:
     pc = PineconeVectorStore(
-        pinecone_api_key=os.environ.get('
+        pinecone_api_key=os.environ.get('PINECONE_KEY'),
         embedding=embedding_model,
         index_name='rag-rubic',
         namespace='vectors_lightmodel'
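Reviewer note: the change completes a truncated pinecone_api_key argument by reading the PINECONE_KEY secret at startup. Below is a minimal sketch of the resulting initialization, assuming PINECONE_KEY is set in the Space's secrets; the embedding model name is a hypothetical stand-in (the real name is cut off at "sentence-transformers/all-Mi" in the hunk context), and the retriever lines at the end are illustrative usage, not part of this commit.

import os

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

# Hypothetical stand-in: the app's real model name is truncated in the diff.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Mirrors the patched lines; PINECONE_KEY must exist in the Space secrets.
pc = PineconeVectorStore(
    pinecone_api_key=os.environ.get('PINECONE_KEY'),
    embedding=embedding_model,
    index_name='rag-rubic',
    namespace='vectors_lightmodel',
)

# Illustrative usage (not in this commit): query the index through a retriever.
retriever = pc.as_retriever(search_kwargs={"k": 5})
print(retriever.invoke("example question"))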
@@ -46,7 +46,7 @@ except Exception as e:
 # Initialize the LLM
 llm = ChatOpenAI(
     model='gpt-4o-mini',
-    api_key=os.environ.get('
+    api_key=os.environ.get('OPENAI_KEY'),
     temperature=0.2
 )
 
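Same pattern for the LLM: the truncated api_key argument now reads the OPENAI_KEY secret. Worth knowing: langchain_openai's ChatOpenAI falls back to the OPENAI_API_KEY environment variable only when api_key is not supplied, so the explicit lookup is what lets this Space use the nonstandard OPENAI_KEY name. A hedged smoke test:

import os

from langchain_openai import ChatOpenAI

# Mirrors the patched lines; OPENAI_KEY must exist in the Space secrets.
llm = ChatOpenAI(
    model='gpt-4o-mini',
    api_key=os.environ.get('OPENAI_KEY'),
    temperature=0.2,
)

# Illustrative check (not in this commit) that the credential is wired up.
print(llm.invoke("Reply with one word: ready").content)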
@@ -85,38 +85,9 @@ prompt = PromptTemplate(
 rag_chain = prompt | llm | StrOutputParser()
 
 # Web search tool for adding data from websites
-web_search_tool = TavilySearchResults(api_key=os.environ.get('TAVILY_API_KEY'), k=
+web_search_tool = TavilySearchResults(api_key=os.environ.get('TAVILY_API_KEY'), k=10)
 
-# Load website data
-try:
-    print("Loading web data...")
-    docs = []
-    for i in url:
-        try:
-            docs.append(WebBaseLoader(i).load())
-        except Exception as e:
-            print(f"Error loading {i}: {e}")
-
-    docs_list = [item for sublist in docs for item in sublist]
 
-    # Split documents into chunks
-    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-        chunk_size=1000,
-        chunk_overlap=100
-    )
-    doc_splits = text_splitter.split_documents(docs_list)
-
-    # VectorStore from the web-scraped documents
-    vectorstore = SKLearnVectorStore.from_documents(
-        documents=doc_splits,
-        embedding=embedding_model
-    )
-    retriever_web = vectorstore.as_retriever(search_kwargs={"k": 5})
-    print(f"Loaded {len(doc_splits)} document chunks from web sources")
-except Exception as e:
-    print(f"Error in web data processing: {e}")
-    # Create a simple retriever that returns empty results if web loading fails
-    retriever_web = lambda x: []
 
 # Define Graph states and transitions
 class GraphState(TypedDict):
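This hunk completes the truncated TavilySearchResults call with k=10 and deletes the import-time WebBaseLoader/SKLearnVectorStore pipeline that used to build retriever_web; web_search() below now queries Tavily directly. One caveat, stated as an assumption about the pinned version: recent langchain_community releases expose the result count as max_results rather than k, so k=10 may be silently ignored. A minimal sketch of the tool's result shape, which is what web_search() indexes into:

import os

from langchain_community.tools.tavily_search import TavilySearchResults

# Mirrors the patched line; TAVILY_API_KEY must exist in the Space secrets.
# If the installed version expects max_results instead of k, the result count
# falls back to the library default.
web_search_tool = TavilySearchResults(api_key=os.environ.get('TAVILY_API_KEY'), k=10)

# Illustrative call (not in this commit): each result dict carries the
# "content" and "url" keys that web_search() below reads.
for res in web_search_tool.invoke({"query": "corrective RAG"}):
    print(res["url"], "->", res["content"][:80])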
@@ -139,36 +110,24 @@ def retrieve_db(state):
     return {'documents': [], 'question': question, 'need_web_search': 'yes'}
 
 def grade_docs(state):
-    """Grades the docs generated by the retriever_db
+    """Grades the docs generated by the retriever_db
+    If 1, returns the docs if 0 proceeds for web search"""
     question = state['question']
     docs = state['documents']
-
-
-
-
-    filtered_data = []
-    web_search_needed = "no"
-
-    try:
-        for doc in docs:
-            doc_content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
-            score = retrieval_grader.invoke({'question': question, 'documents': doc_content})
-            grade = score.binary_score
-            if grade == 'yes':
-                filtered_data.append(doc)
-    except Exception as e:
-        print(f"Error in document grading: {e}")
-        web_search_needed = "yes"
-
-    # If no relevant documents were found, trigger web search
-    if not filtered_data:
-        web_search_needed = "yes"
+    filterd_data = []
+    web = "no"
+    for data in docs:
+        score = retrieval_grader.invoke({'question':question, 'documents':docs})
 
-
-
-    return {'documents': filtered_data, 'question': question, 'need_web_search': web_search_needed}
-
-
+        grade = score.binary_score
+
+        if grade == 'yes':
+            filterd_data.append(data)
+        else:
+            #print("----------Failed, proceeding with WebSearch------------------")
+            web = 'yes'
+    return {"documents": filterd_data, "question": question, "need_web_search": web}
+
 
 def decide(state):
     """Decide if the generation should be based on DB or web search DATA"""
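A note on the rewritten grade_docs: the loop iterates for data in docs but invokes the grader with the whole list ('documents': docs) on every pass, so each document is graded against the full set rather than individually; the deleted version graded one doc_content at a time. A hedged per-document sketch, reusing the function's own names (retrieval_grader, filterd_data, web), typo included:

for data in docs:
    # Grade each retrieved document on its own, as the old code did.
    content = data.page_content if hasattr(data, 'page_content') else str(data)
    score = retrieval_grader.invoke({'question': question, 'documents': content})
    if score.binary_score == 'yes':
        filterd_data.append(data)
    else:
        web = 'yes'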
@@ -179,58 +138,55 @@ def decide(state):
     return 'generate'
 
 def web_search(state):
-    """
+    """Perform a web search and store both content and source URLs."""
     question = state['question']
-    documents = state
+    documents = state["documents"]
 
-
-    try:
-        docs = retriever_web.invoke(question)
-        if not docs:
-            # If no results, try Tavily search
-            search_results = web_search_tool.invoke(question)
-            data = "\n".join(result["content"] for result in search_results)
-            docs = [Document(page_content=data)]
-    except Exception as e:
-        print(f"Web search error: {e}")
-        # Create a fallback document if search fails
-        docs = [Document(page_content="Unable to retrieve additional information.")]
+    # Get search results
+    results = web_search_tool.invoke({"query": question})
 
-    #
-
-
-    return {'documents': docs, 'question': question}
+    # Process results with sources
+    docs = []
+    for res in results:
+        content = res["content"]  # Extract answer content
+        source = res["url"]  # Extract source URL
+
+        # Create Document with metadata
+        doc = Document(page_content=content, metadata={"source": source})
+        docs.append(doc)
+
+    if not results:
+        #print("No results from web search. Returning default response.")
+        return {"documents": [], "question": question}
+
+    documents.extend(docs)
+    return {"documents": documents, "question": question}
+
 
 def generate(state):
-    "
-    documents = state
+    #print("Inside generate function")  # Debugging
+    documents = state['documents']
     question = state['question']
 
-    #
-
-    try:
-        context = "\n\n".join(
-            doc.page_content if hasattr(doc, 'page_content') else str(doc)
-            for doc in documents
-        )
-    except Exception as e:
-        print(f"Error processing documents: {e}")
-        context = "Error retrieving relevant information."
-    else:
-        context = "No relevant information found."
-
-    try:
-        response = rag_chain.invoke({'context': context, 'question': question})
-    except Exception as e:
-        print(f"Generation error: {e}")
-        response = "I apologize, but I encountered an error while generating a response. Please try asking your question again."
+    # Generate response using retrieved documents
+    response = rag_chain.invoke({'context': documents, 'question': question})
 
+    # Extract source URLs
+    sources = [doc.metadata.get("source", "Unknown source") for doc in documents if "source" in doc.metadata]
+
+    # Format response with citations
+    formatted_response = response + "\n\nSources:\n" + "\n".join(sources) if sources else response
+
+    #print("Generated response:", formatted_response)  # Debugging
+
+    # Return response with sources
     return {
         'documents': documents,
        'question': question,
-        'generation': response
+        'generation': formatted_response  # Append sources to the response
     }
 
+
 # Compile Workflow
 workflow = StateGraph(GraphState)
 workflow.add_node("retrieve", retrieve_db)
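Two observations on this hunk: generate() now drops its try/except guards and passes the raw Document list to the prompt as 'context' (the deleted version joined page_content explicitly), and the citation line uses an unparenthesized conditional expression. That expression does group as intended, since + binds tighter than if/else, but it is easy to misread. A hedged sketch of the same steps with the stringification and grouping made explicit, reusing documents, question, and rag_chain from the function above:

# Join document text explicitly rather than relying on prompt stringification.
context = "\n\n".join(
    doc.page_content if hasattr(doc, 'page_content') else str(doc)
    for doc in documents
)
response = rag_chain.invoke({'context': context, 'question': question})

# Same citation logic, with parentheses making the grouping explicit.
sources = [doc.metadata.get("source") for doc in documents if "source" in doc.metadata]
formatted_response = (response + "\n\nSources:\n" + "\n".join(sources)) if sources else response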
@@ -275,7 +231,7 @@ def process_query(user_input, history):
         else:
             response = "I couldn't find relevant information to answer your question."
     except Exception as e:
-        print(f"Error in crag execution: {e}")
+        #print(f"Error in crag execution: {e}")
         response = "I encountered an error while processing your request. Please try again."
 
     # Update the last response in history
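Commenting out this print keeps the console quiet but discards the only diagnostic for graph failures. A hedged alternative using the standard logging module, which keeps the traceback in the Space logs while preserving the user-facing message; process_query_sketch and run_graph are hypothetical names, not part of this commit:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("app")

def process_query_sketch(run_graph, user_input):
    """Hypothetical wrapper showing the logging-based alternative."""
    try:
        return run_graph(user_input)
    except Exception:
        # logger.exception records the full traceback, unlike the removed print.
        logger.exception("Error in crag execution")
        return "I encountered an error while processing your request. Please try again."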