Pavan2k4 committed
Commit 23b02f3 · verified · 1 Parent(s): be98308

Update app.py

Files changed (1):
  1. app.py +55 -99
app.py CHANGED
@@ -32,7 +32,7 @@ embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-Mi
 # Initialize Pinecone connection
 try:
     pc = PineconeVectorStore(
-        pinecone_api_key=os.environ.get('PINCE_CONE_LIGHT'),
+        pinecone_api_key=os.environ.get('PINECONE_KEY'),
         embedding=embedding_model,
         index_name='rag-rubic',
         namespace='vectors_lightmodel'
@@ -46,7 +46,7 @@ except Exception as e:
 # Initialize the LLM
 llm = ChatOpenAI(
     model='gpt-4o-mini',
-    api_key=os.environ.get('OPEN_AI_KEY'),
+    api_key=os.environ.get('OPENAI_KEY'),
     temperature=0.2
 )
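Both secrets are renamed in this commit (PINCE_CONE_LIGHT → PINECONE_KEY, OPEN_AI_KEY → OPENAI_KEY), so the Space's configured secrets must be renamed to match. A minimal startup check for the new names (an illustrative helper, not part of app.py):

import os

# Key names as used in this commit; TAVILY_API_KEY is unchanged.
REQUIRED_SECRETS = ("PINECONE_KEY", "OPENAI_KEY", "TAVILY_API_KEY")

def check_secrets() -> None:
    # Fail at startup with a clear message instead of a mid-request error.
    missing = [name for name in REQUIRED_SECRETS if not os.environ.get(name)]
    if missing:
        raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")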
 
@@ -85,38 +85,9 @@ prompt = PromptTemplate(
 rag_chain = prompt | llm | StrOutputParser()
 
 # Web search tool for adding data from websites
-web_search_tool = TavilySearchResults(api_key=os.environ.get('TAVILY_API_KEY'), k=5)
+web_search_tool = TavilySearchResults(api_key=os.environ.get('TAVILY_API_KEY'), k=10)
 
-# Load website data
-try:
-    print("Loading web data...")
-    docs = []
-    for i in url:
-        try:
-            docs.append(WebBaseLoader(i).load())
-        except Exception as e:
-            print(f"Error loading {i}: {e}")
-
-    docs_list = [item for sublist in docs for item in sublist]
-
-    # Split documents into chunks
-    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-        chunk_size=1000,
-        chunk_overlap=100
-    )
-    doc_splits = text_splitter.split_documents(docs_list)
-
-    # VectorStore from the web-scraped documents
-    vectorstore = SKLearnVectorStore.from_documents(
-        documents=doc_splits,
-        embedding=embedding_model
-    )
-    retriever_web = vectorstore.as_retriever(search_kwargs={"k": 5})
-    print(f"Loaded {len(doc_splits)} document chunks from web sources")
-except Exception as e:
-    print(f"Error in web data processing: {e}")
-    # Create a simple retriever that returns empty results if web loading fails
-    retriever_web = lambda x: []
 
 # Define Graph states and transitions
 class GraphState(TypedDict):
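With the WebBaseLoader/SKLearnVectorStore scraping pipeline removed, web augmentation now happens per query through Tavily, with k raised from 5 to 10. Based purely on how the tool is invoked in web_search() further down this diff, the call shape is:

# Illustrative call, mirroring the usage in web_search() below.
results = web_search_tool.invoke({"query": "example question"})
for res in results:
    print(res["url"])      # source URL, kept as Document metadata
    print(res["content"])  # snippet text used as page_content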
@@ -139,36 +110,24 @@ def retrieve_db(state):
     return {'documents': [], 'question': question, 'need_web_search': 'yes'}
 
 def grade_docs(state):
-    """Grades the docs generated by the retriever_db"""
+    """Grades the docs generated by the retriever_db.
+    If graded 'yes', keeps the doc; if 'no', proceeds to web search."""
     question = state['question']
     docs = state['documents']
-
-    if not docs:
-        return {"documents": [], 'question': question, 'need_web_search': 'yes'}
-
-    filtered_data = []
-    web_search_needed = "no"
-
-    try:
-        for doc in docs:
-            doc_content = doc.page_content if hasattr(doc, 'page_content') else str(doc)
-            score = retrieval_grader.invoke({'question': question, 'documents': doc_content})
-            grade = score.binary_score
-            if grade == 'yes':
-                filtered_data.append(doc)
-    except Exception as e:
-        print(f"Error in document grading: {e}")
-        web_search_needed = "yes"
-
-    # If no relevant documents were found, trigger web search
-    if not filtered_data:
-        web_search_needed = "yes"
-
-    return {
-        "documents": filtered_data,
-        'question': question,
-        'need_web_search': web_search_needed # Updated key name
-    }
+    filterd_data = []
+    web = "no"
+    for data in docs:
+        score = retrieval_grader.invoke({'question': question, 'documents': docs})
+        grade = score.binary_score
+
+        if grade == 'yes':
+            filterd_data.append(data)
+        else:
+            #print("----------Failed, proceeding with WebSearch------------------")
+            web = 'yes'
+    return {"documents": filterd_data, "question": question, "need_web_search": web}
+
 
 def decide(state):
     """Decide if the generation should be based on DB or web search DATA"""
@@ -179,58 +138,55 @@ def decide(state):
     return 'generate'
 
 def web_search(state):
-    """Based on the Grade, will proceed with WebSearch within the given URL's."""
+    """Perform a web search and store both content and source URLs."""
     question = state['question']
-    documents = state.get("documents", [])
+    documents = state["documents"]
 
-    try:
-        # First try website-specific retriever
-        docs = retriever_web.invoke(question)
-        if not docs:
-            # If no results, try Tavily search
-            search_results = web_search_tool.invoke(question)
-            data = "\n".join(result["content"] for result in search_results)
-            docs = [Document(page_content=data)]
-    except Exception as e:
-        print(f"Web search error: {e}")
-        # Create a fallback document if search fails
-        docs = [Document(page_content="Unable to retrieve additional information.")]
+    # Get search results
+    results = web_search_tool.invoke({"query": question})
 
-    # Combine existing documents with new ones
-    all_docs = documents + docs
-
-    return {'documents': all_docs, 'question': question}
+    # Process results with sources
+    docs = []
+    for res in results:
+        content = res["content"]  # Extract answer content
+        source = res["url"]  # Extract source URL
+
+        # Create Document with metadata
+        doc = Document(page_content=content, metadata={"source": source})
+        docs.append(doc)
+
+    if not results:
+        #print("No results from web search. Returning default response.")
+        return {"documents": [], "question": question}
+
+    documents.extend(docs)
+    return {"documents": documents, "question": question}
+
 
 def generate(state):
-    """Generate response based on retrieved documents"""
-    documents = state.get('documents', [])
+    #print("Inside generate function")  # Debugging
+    documents = state['documents']
     question = state['question']
 
-    # Convert documents to text for the context
-    if documents:
-        try:
-            context = "\n\n".join(
-                doc.page_content if hasattr(doc, 'page_content') else str(doc)
-                for doc in documents
-            )
-        except Exception as e:
-            print(f"Error processing documents: {e}")
-            context = "Error retrieving relevant information."
-    else:
-        context = "No relevant information found."
-
-    try:
-        response = rag_chain.invoke({'context': context, 'question': question})
-    except Exception as e:
-        print(f"Generation error: {e}")
-        response = "I apologize, but I encountered an error while generating a response. Please try asking your question again."
+    # Generate response using retrieved documents
+    response = rag_chain.invoke({'context': documents, 'question': question})
 
+    # Extract source URLs
+    sources = [doc.metadata.get("source", "Unknown source") for doc in documents if "source" in doc.metadata]
+
+    # Format response with citations
+    formatted_response = response + "\n\nSources:\n" + "\n".join(sources) if sources else response
+
+    #print("Generated response:", formatted_response)  # Debugging
+
+    # Return response with sources
     return {
         'documents': documents,
         'question': question,
-        'generation': response
+        'generation': formatted_response  # Append sources to the response
     }
 
+
 # Compile Workflow
 workflow = StateGraph(GraphState)
 workflow.add_node("retrieve", retrieve_db)
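Only the first add_node call is visible in this hunk. Assuming the remaining nodes mirror the functions above (every node name except "retrieve" is a guess, as is the exact edge layout), the compiled graph would be wired and run roughly like this:

from langgraph.graph import StateGraph, END

workflow = StateGraph(GraphState)
workflow.add_node("retrieve", retrieve_db)
workflow.add_node("grade", grade_docs)       # assumed node name
workflow.add_node("web_search", web_search)  # assumed node name
workflow.add_node("generate", generate)      # assumed node name

workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade")
# decide() returns 'generate'; a 'web_search' branch is implied by the graph
workflow.add_conditional_edges("grade", decide, {
    "web_search": "web_search",
    "generate": "generate",
})
workflow.add_edge("web_search", "generate")
workflow.add_edge("generate", END)

crag = workflow.compile()  # 'crag' matches the error message in process_query
result = crag.invoke({"question": "example question"})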
@@ -275,7 +231,7 @@ def process_query(user_input, history):
     else:
         response = "I couldn't find relevant information to answer your question."
     except Exception as e:
-        print(f"Error in crag execution: {e}")
+        #print(f"Error in crag execution: {e}")
         response = "I encountered an error while processing your request. Please try again."
 
     # Update the last response in history
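The (user_input, history) signature of process_query looks like a chat UI callback. The UI code is outside this diff, so the following hookup is only a plausible sketch (Gradio is an assumption, as is the return contract of process_query):

import gradio as gr

# Assumes process_query(message, history) returns the response text.
demo = gr.ChatInterface(fn=process_query, title="RAG demo")
demo.launch()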