T-K-O-H committed
Commit d91c001 · 1 parent: 3076d04

Fix metrics passing in workflow and state

Files changed (2):
  1. app.py +24 -2
  2. rag_graph.py +83 -20
app.py CHANGED
@@ -16,6 +16,12 @@ from langchain_openai import OpenAIEmbeddings
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Create a string buffer to capture logs
+log_stream = io.StringIO()
+handler = logging.StreamHandler(log_stream)
+handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+logger.addHandler(handler)
+
 load_dotenv()
 
 # Set page config
@@ -179,8 +185,24 @@ if st.button("Submit") or question != default_question:
 
     # Display the response and metrics
     st.markdown(result["response"])
-    st.write("Full Response Data:")
-    st.json(result)
+
+    # Display the raw metrics dictionary
+    if "metrics" in result and result["metrics"]:
+        st.markdown("---")  # Add a separator
+        st.subheader("RAGAS Metrics")
+        st.write("Raw metrics dictionary:")
+        st.json(result["metrics"])
+
+        # Display the metrics calculation log
+        metrics_log = log_stream.getvalue()
+        if "RAGAS metrics calculated" in metrics_log:
+            st.markdown("---")
+            st.subheader("Metrics Calculation Log")
+            st.code(metrics_log.split("RAGAS metrics calculated:")[-1].strip())
+    else:
+        st.warning("No metrics available for this response")
+        st.write("Debug - Full result dictionary:")
+        st.json(result)
 
     # Add assistant response to chat history
     st.session_state.messages.append({
 
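Note: the two app.py hunks rely on a standard logging pattern — attach a StreamHandler backed by an in-memory io.StringIO buffer to the logger, then read the buffer back when rendering the page. A minimal self-contained sketch of that pattern (standard library only; the logger name below is illustrative, not part of the commit):

import io
import logging

logger = logging.getLogger("log_capture_demo")  # hypothetical logger name
logger.setLevel(logging.INFO)

# Route this logger's records into an in-memory buffer in addition to any other handlers.
log_stream = io.StringIO()
handler = logging.StreamHandler(log_stream)
handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(handler)

logger.info("RAGAS metrics calculated: {'faithfulness': 0.9}")

# Later (e.g. while rendering the Streamlit page) the captured text can be read back.
captured = log_stream.getvalue()
print("RAGAS metrics calculated" in captured)  # True

The hunk at line 20 calls io.StringIO(), so it assumes app.py already imports io elsewhere; the diff does not add that import.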
rag_graph.py CHANGED
@@ -24,6 +24,7 @@ class AgentState(TypedDict):
     messages: Annotated[List[HumanMessage | AIMessage], "The messages in the conversation"]
     context: Annotated[str, "The retrieved context"]
     response: Annotated[str, "The generated response"]
+    metrics: Annotated[Dict, "The RAGAS metrics"]
     next: str
 
 # Initialize components
@@ -76,10 +77,25 @@ def retrieve(state: AgentState) -> Dict:
             raise ValueError("No valid context could be retrieved")
 
         logger.info(f"Retrieved context length: {len(context)} characters")
-        return {"context": context, "next": "generate"}
+        return {
+            "context": context,
+            "metrics": {},  # Initialize empty metrics
+            "next": "generate"
+        }
     except Exception as e:
         logger.error(f"Error in retrieval: {str(e)}")
-        return {"context": "", "next": "generate"}
+        return {
+            "context": "",
+            "metrics": {
+                "error": str(e),
+                "faithfulness": 0.0,
+                "answer_relevancy": 0.0,
+                "context_precision": 0.0,
+                "context_recall": 0.0,
+                "answer_correctness": 0.0
+            },
+            "next": "generate"
+        }
 
 # Define the generation function
 def generate(state: AgentState) -> Dict:
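Background on why the hunks above work: a LangGraph node returns a partial state update, and the framework merges that dict into the shared, schema-defined state, so the new metrics key has to be declared on AgentState for the value to be carried from one node to the next. A minimal sketch of that behaviour (a toy two-node graph, not the project's graph; all names below are illustrative):

from typing import Dict, TypedDict
from langgraph.graph import StateGraph, END

class DemoState(TypedDict):
    response: str
    metrics: Dict

def generate(state: DemoState) -> Dict:
    # Return a partial update; LangGraph merges it into the shared state.
    return {"response": "hello", "metrics": {"faithfulness": 1.0}}

def evaluate(state: DemoState) -> Dict:
    # The metrics written by generate() are visible here because the key
    # is declared on DemoState.
    assert state["metrics"]["faithfulness"] == 1.0
    return {"response": state["response"]}

graph = StateGraph(DemoState)
graph.add_node("generate", generate)
graph.add_node("evaluate", evaluate)
graph.set_entry_point("generate")
graph.add_edge("generate", "evaluate")
graph.add_edge("evaluate", END)

final_state = graph.compile().invoke({"response": "", "metrics": {}})
print(final_state["metrics"])  # {'faithfulness': 1.0}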
@@ -91,7 +107,14 @@ def generate(state: AgentState) -> Dict:
             logger.warning("Empty context in generation step")
             return {
                 "response": "I apologize, but I couldn't find any relevant information in the knowledge base to answer your question. Please try rephrasing your question or upload more relevant documents.",
-                "metrics": {},  # Add empty metrics
+                "metrics": {
+                    "faithfulness": 0.0,
+                    "answer_relevancy": 0.0,
+                    "context_precision": 0.0,
+                    "context_recall": 0.0,
+                    "answer_correctness": 0.0,
+                    "note": "No context available for evaluation"
+                },
                 "next": "evaluate"
             }
 
@@ -123,6 +146,7 @@ def generate(state: AgentState) -> Dict:
 
         # Calculate metrics directly in generate
        try:
+            logger.info("Creating dataset for metrics calculation")
             dataset = Dataset.from_dict({
                 "question": [messages[-1].content],
                 "contexts": [[context]],
@@ -130,6 +154,7 @@ def generate(state: AgentState) -> Dict:
                 "ground_truth": [context]
             })
 
+            logger.info("Calculating RAGAS metrics")
             metrics_dict = {}
             result = evaluate(dataset, metrics=[faithfulness, answer_relevancy, context_precision, context_recall, answer_correctness])
 
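For reference, the evaluation payload built in the hunks above is a single-row Hugging Face Dataset whose ground_truth is simply the retrieved context. A stand-alone sketch of that shape (placeholder strings only):

from datasets import Dataset

# One evaluation sample per turn; "contexts" is a list of context strings per question.
dataset = Dataset.from_dict({
    "question": ["<user question>"],
    "contexts": [["<retrieved context>"]],
    "answer": ["<generated response>"],
    "ground_truth": ["<retrieved context>"],  # the diff reuses the context as ground truth
})

print(dataset.column_names)  # ['question', 'contexts', 'answer', 'ground_truth']
print(dataset.num_rows)      # 1

Because ground_truth is just the retrieved context, the metrics that compare against the ground truth (context_recall, answer_correctness) effectively measure agreement with the context itself rather than with an independently curated reference answer.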
 
@@ -139,21 +164,35 @@ def generate(state: AgentState) -> Dict:
             metrics_dict["context_recall"] = float(np.mean(result["context_recall"]))
             metrics_dict["answer_correctness"] = float(np.mean(result["answer_correctness"]))
 
-            logger.info(f"Metrics calculated: {metrics_dict}")
+            logger.info(f"RAGAS metrics calculated: {metrics_dict}")
         except Exception as e:
             logger.error(f"Error calculating metrics: {str(e)}")
-            metrics_dict = {}
+            metrics_dict = {
+                "error": str(e),
+                "faithfulness": 0.0,
+                "answer_relevancy": 0.0,
+                "context_precision": 0.0,
+                "context_recall": 0.0,
+                "answer_correctness": 0.0
+            }
 
         return {
             "response": response,
-            "metrics": metrics_dict,  # Include metrics in the response
+            "metrics": metrics_dict,
             "next": "evaluate"
         }
     except Exception as e:
         logger.error(f"Error in generation: {str(e)}")
         return {
             "response": "I apologize, but I encountered an error while generating a response. Please try again.",
-            "metrics": {},  # Add empty metrics
+            "metrics": {
+                "error": str(e),
+                "faithfulness": 0.0,
+                "answer_relevancy": 0.0,
+                "context_precision": 0.0,
+                "context_recall": 0.0,
+                "answer_correctness": 0.0
+            },
             "next": "evaluate"
         }
 
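The float(np.mean(...)) conversions in this hunk assume that indexing the RAGAS result by metric name yields a per-sample sequence of scores (this indexing behaviour varies across ragas versions). A tiny stand-in illustrating just that conversion step, without calling RAGAS:

import numpy as np

# Stand-in for a RAGAS result indexed by metric name: one score per evaluated sample.
result = {"faithfulness": [0.92], "answer_relevancy": [0.88]}

# Collapse each per-sample sequence to a plain float so it can be stored in the graph
# state and serialised by st.json() in app.py.
metrics_dict = {name: float(np.mean(scores)) for name, scores in result.items()}
print(metrics_dict)  # {'faithfulness': 0.92, 'answer_relevancy': 0.88}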
 
@@ -172,23 +211,33 @@ def evaluate_rag(state: AgentState) -> Dict:
         logger.info(f"Context preview: {context[:200]}...")
         logger.info(f"Response preview: {response[:200]}...")
 
+        # Check if metrics are already in state
+        if "metrics" in state:
+            logger.info(f"Metrics found in state: {state['metrics']}")
+            return {"context": context, "response": response, "metrics": state["metrics"], "next": END}
+
         # Validate inputs
         if not context.strip():
             logger.error("Empty context detected")
-            return {"context": context, "response": response, "metrics": {}, "next": END}
+            return {"context": context, "response": response, "metrics": {
+                "faithfulness": 0.0,
+                "answer_relevancy": 0.0,
+                "context_precision": 0.0,
+                "context_recall": 0.0,
+                "answer_correctness": 0.0,
+                "note": "Empty context"
+            }, "next": END}
 
         if not response.strip():
             logger.error("Empty response detected")
-            return {"context": context, "response": response, "metrics": {}, "next": END}
-
-        # Check for minimum content requirements
-        if len(context) < 50:
-            logger.error(f"Context too short: {len(context)} characters")
-            return {"context": context, "response": response, "metrics": {}, "next": END}
-
-        if len(response) < 20:
-            logger.error(f"Response too short: {len(response)} characters")
-            return {"context": context, "response": response, "metrics": {}, "next": END}
+            return {"context": context, "response": response, "metrics": {
+                "faithfulness": 0.0,
+                "answer_relevancy": 0.0,
+                "context_precision": 0.0,
+                "context_recall": 0.0,
+                "answer_correctness": 0.0,
+                "note": "Empty response"
+            }, "next": END}
 
         logger.info("Creating evaluation dataset...")
         try:
@@ -251,12 +300,26 @@ def evaluate_rag(state: AgentState) -> Dict:
         except Exception as eval_error:
             logger.error(f"Error during RAGAS evaluation: {str(eval_error)}")
             logger.error(f"Error type: {type(eval_error)}")
-            return {"context": context, "response": response, "metrics": {}, "next": END}
+            return {"context": context, "response": response, "metrics": {
+                "faithfulness": 0.0,
+                "answer_relevancy": 0.0,
+                "context_precision": 0.0,
+                "context_recall": 0.0,
+                "answer_correctness": 0.0,
+                "error": str(eval_error)
+            }, "next": END}
 
     except Exception as e:
         logger.error(f"Error in RAGAS evaluation: {str(e)}")
         logger.error(f"Error type: {type(e)}")
-        return {"context": context, "response": response, "metrics": {}, "next": END}
+        return {"context": context, "response": response, "metrics": {
+            "faithfulness": 0.0,
+            "answer_relevancy": 0.0,
+            "context_precision": 0.0,
+            "context_recall": 0.0,
+            "answer_correctness": 0.0,
+            "error": str(e)
+        }, "next": END}
 
 # Create the workflow
 def create_rag_graph():
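With retrieve() seeding "metrics": {} and generate() overwriting it with the computed scores, the new early return in evaluate_rag() passes the metrics through to the final state, which is what app.py now reads from result["metrics"]. A sketch of how the caller is expected to consume the graph after this change (the exact entry point in the project may differ; the initial-state keys below mirror AgentState and are an assumption):

from langchain_core.messages import HumanMessage
from rag_graph import create_rag_graph  # assumed to return a compiled LangGraph app

graph = create_rag_graph()
result = graph.invoke({
    "messages": [HumanMessage(content="<user question>")],
    "context": "",
    "response": "",
    "metrics": {},
    "next": "",
})

print(result["response"])
print(result["metrics"])  # populated by generate() and passed through evaluate_rag()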
 