ganesh3 committed
Commit 9da39b7 · verified · 1 Parent(s): e2cbf8c

Update app/evaluation.py

Files changed (1)
  1. app/evaluation.py +108 -103
app/evaluation.py CHANGED
@@ -2,16 +2,46 @@ from sklearn.metrics.pairwise import cosine_similarity
  import numpy as np
  import pandas as pd
  import json
- import ollama
  import requests
- import sqlite3
  from tqdm import tqdm
  import csv
+ import logging
+ import sys
+ from transformers import pipeline
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     stream=sys.stdout
+ )
+ logger = logging.getLogger(__name__)

  class EvaluationSystem:
      def __init__(self, data_processor, database_handler):
          self.data_processor = data_processor
          self.db_handler = database_handler
+         # Initialize the model
+         self.model = pipeline(
+             "text-generation",
+             model="google/flan-t5-base",
+             device=-1  # Use CPU
+         )
+         logger.info("Initialized evaluation system with flan-t5-base model")
+
+     def generate_llm_response(self, prompt):
+         """Generate response using Hugging Face model"""
+         try:
+             response = self.model(
+                 prompt,
+                 max_length=512,
+                 min_length=64,
+                 num_return_sequences=1
+             )[0]['generated_text']
+             return response
+         except Exception as e:
+             logger.error(f"Error generating response: {str(e)}")
+             return None

      def relevance_scoring(self, query, retrieved_docs, top_k=5):
          query_embedding = self.data_processor.embedding_model.encode(query)
@@ -35,44 +65,31 @@ class EvaluationSystem:
              result = cursor.fetchone()
              return result[0] if result[0] is not None else 0

-     def evaluate_rag_performance(self, rag_system, test_queries, reference_answers, index_name):
-         relevance_scores = []
-         similarity_scores = []
-         human_scores = []
-
-         for query, reference in zip(test_queries, reference_answers):
-             retrieved_docs = rag_system.data_processor.search(query, num_results=5, method='hybrid', index_name=index_name)
-             generated_answer, _ = rag_system.query(query, search_method='hybrid', index_name=index_name)
-
-             relevance_scores.append(self.relevance_scoring(query, retrieved_docs))
-             similarity_scores.append(self.answer_similarity(generated_answer, reference))
-             human_scores.append(self.human_evaluation(index_name, query))
-
-         return {
-             "avg_relevance_score": np.mean(relevance_scores),
-             "avg_similarity_score": np.mean(similarity_scores),
-             "avg_human_score": np.mean(human_scores)
-         }
-
      def llm_as_judge(self, question, generated_answer, prompt_template):
-         prompt = prompt_template.format(question=question, answer_llm=generated_answer)
-
+         prompt = prompt_template.format(
+             question=question,
+             answer_llm=generated_answer
+         )
          try:
-             response = ollama.chat(
-                 model='phi3.5',
-                 messages=[{"role": "user", "content": prompt}]
-             )
-             evaluation = json.loads(response['message']['content'])
-             return evaluation
+             response = self.generate_llm_response(prompt)
+             if response:
+                 # Try to parse JSON response
+                 try:
+                     evaluation = json.loads(response)
+                     return evaluation
+                 except json.JSONDecodeError:
+                     logger.error("Failed to parse LLM response as JSON")
+                     return None
+             return None
          except Exception as e:
-             print(f"Error in LLM evaluation: {str(e)}")
+             logger.error(f"Error in LLM evaluation: {str(e)}")
              return None

      def evaluate_rag(self, rag_system, ground_truth_file, prompt_template=None):
          try:
              ground_truth = pd.read_csv(ground_truth_file)
          except FileNotFoundError:
-             print("Ground truth file not found. Please generate ground truth data first.")
+             logger.error("Ground truth file not found. Please generate ground truth data first.")
              return None

          evaluations = []
@@ -84,13 +101,13 @@ class EvaluationSystem:
              index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)

              if not index_name:
-                 print(f"No index found for video {video_id}. Skipping this question.")
+                 logger.warning(f"No index found for video {video_id}. Skipping this question.")
                  continue

              try:
                  answer_llm, _ = rag_system.query(question, search_method='hybrid', index_name=index_name)
              except ValueError as e:
-                 print(f"Error querying RAG system: {str(e)}")
+                 logger.error(f"Error querying RAG system: {str(e)}")
                  continue

              if prompt_template:
@@ -114,79 +131,25 @@ class EvaluationSystem:
              })

          # Save evaluations to CSV
-         csv_path = 'data/evaluation_results.csv'
-         with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
-             fieldnames = ['video_id', 'question', 'answer', 'relevance', 'explanation']
-             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-             writer.writeheader()
-             for eval_data in evaluations:
-                 writer.writerow(eval_data)
-
-         print(f"Evaluation results saved to {csv_path}")
-
-         # Save evaluations to database
-         self.save_evaluations_to_db(evaluations)
+         if evaluations:
+             csv_path = 'data/evaluation_results.csv'
+             with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
+                 fieldnames = ['video_id', 'question', 'answer', 'relevance', 'explanation']
+                 writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+                 writer.writeheader()
+                 for eval_data in evaluations:
+                     writer.writerow(eval_data)
+             logger.info(f"Evaluation results saved to {csv_path}")
+
+             # Save evaluations to database
+             self.save_evaluations_to_db(evaluations)

          return evaluations

      def save_evaluations_to_db(self, evaluations):
-         with sqlite3.connect(self.db_handler.db_path) as conn:
-             cursor = conn.cursor()
-             cursor.execute('''
-                 CREATE TABLE IF NOT EXISTS rag_evaluations (
-                     id INTEGER PRIMARY KEY AUTOINCREMENT,
-                     video_id TEXT,
-                     question TEXT,
-                     answer TEXT,
-                     relevance TEXT,
-                     explanation TEXT
-                 )
-             ''')
-             for eval_data in evaluations:
-                 cursor.execute('''
-                     INSERT INTO rag_evaluations (video_id, question, answer, relevance, explanation)
-                     VALUES (?, ?, ?, ?, ?)
-                 ''', (eval_data['video_id'], eval_data['question'], eval_data['answer'],
-                       eval_data['relevance'], eval_data['explanation']))
-             conn.commit()
-         print("Evaluation results saved to database")
-
-     def run_full_evaluation(self, rag_system, ground_truth_file, prompt_template=None):
-         # Load ground truth
-         ground_truth = pd.read_csv(ground_truth_file)
-
-         # Evaluate RAG
-         rag_evaluations = self.evaluate_rag(rag_system, ground_truth_file, prompt_template)
-
-         # Evaluate search performance
-         def search_function(query, video_id):
-             index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
-             if index_name:
-                 return rag_system.data_processor.search(query, num_results=10, method='hybrid', index_name=index_name)
-             return []
-
-         search_performance = self.evaluate_search(ground_truth, search_function)
-
-         # Optimize search parameters
-         param_ranges = {'content': (0.0, 3.0)}  # Example parameter range
-
-         def objective_function(params):
-             def parameterized_search(query, video_id):
-                 index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
-                 if index_name:
-                     return rag_system.data_processor.search(query, num_results=10, method='hybrid', index_name=index_name, boost_dict=params)
-                 return []
-             return self.evaluate_search(ground_truth, parameterized_search)['mrr']
-
-         best_params, best_score = self.simple_optimize(param_ranges, objective_function)
-
-         return {
-             "rag_evaluations": rag_evaluations,
-             "search_performance": search_performance,
-             "best_params": best_params,
-             "best_score": best_score
-         }
-
+         for eval_data in evaluations:
+             self.db_handler.save_rag_evaluation(eval_data)
+         logger.info("Evaluation results saved to database")

      def hit_rate(self, relevance_total):
          return sum(any(line) for line in relevance_total) / len(relevance_total)
@@ -207,7 +170,7 @@ class EvaluationSystem:
          best_score = float('-inf')
          for _ in range(n_iterations):
              current_params = {param: np.random.uniform(min_val, max_val)
-                               for param, (min_val, max_val) in param_ranges.items()}
+                               for param, (min_val, max_val) in param_ranges.items()}
              current_score = objective_function(current_params)
              if current_score > best_score:
                  best_score = current_score
@@ -224,4 +187,46 @@ class EvaluationSystem:
          return {
              'hit_rate': self.hit_rate(relevance_total),
              'mrr': self.mrr(relevance_total),
          }
+
+     def run_full_evaluation(self, rag_system, ground_truth_file, prompt_template=None):
+         # Load ground truth
+         ground_truth = pd.read_csv(ground_truth_file)
+
+         # Evaluate RAG
+         rag_evaluations = self.evaluate_rag(rag_system, ground_truth_file, prompt_template)
+
+         # Evaluate search performance
+         def search_function(query, video_id):
+             index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+             if index_name:
+                 return rag_system.data_processor.search(query, num_results=10, method='hybrid', index_name=index_name)
+             return []
+
+         search_performance = self.evaluate_search(ground_truth, search_function)
+
+         # Optimize search parameters
+         param_ranges = {'content': (0.0, 3.0)}  # Example parameter range
+
+         def objective_function(params):
+             def parameterized_search(query, video_id):
+                 index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+                 if index_name:
+                     return rag_system.data_processor.search(
+                         query,
+                         num_results=10,
+                         method='hybrid',
+                         index_name=index_name,
+                         boost_dict=params
+                     )
+                 return []
+             return self.evaluate_search(ground_truth, parameterized_search)['mrr']
+
+         best_params, best_score = self.simple_optimize(param_ranges, objective_function)
+
+         return {
+             "rag_evaluations": rag_evaluations,
+             "search_performance": search_performance,
+             "best_params": best_params,
+             "best_score": best_score
+         }
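
Usage note (not part of the commit): a minimal sketch of how the updated EvaluationSystem might be driven end to end. The DataProcessor, DatabaseHandler, and RAGSystem names, their import paths, the no-argument constructors, and the ground-truth CSV path are assumptions inferred from the constructor arguments and calls visible in this diff; only EvaluationSystem's own methods (evaluate_rag, run_full_evaluation) and the return keys of run_full_evaluation come from the file itself. Because llm_as_judge parses the judge output with json.loads, the example prompt template asks for a JSON object with relevance and explanation fields, matching the CSV fieldnames used above.

# Hypothetical driver script -- module paths and class names below are assumptions,
# not part of this commit; only EvaluationSystem's methods come from the diff above.
from app.evaluation import EvaluationSystem
from app.data_processor import DataProcessor    # assumed location of the data processor
from app.database import DatabaseHandler        # assumed location of the DB handler
from app.rag import RAGSystem                   # assumed location of the RAG system

data_processor = DataProcessor()                # assumed no-argument constructors
db_handler = DatabaseHandler()
rag_system = RAGSystem(data_processor, db_handler)

evaluator = EvaluationSystem(data_processor, db_handler)

# llm_as_judge parses the judge output with json.loads, so the template asks for JSON
# with the same fields that evaluate_rag writes to CSV and the database.
judge_prompt = (
    "You are evaluating a RAG answer.\n"
    "Question: {question}\n"
    "Answer: {answer_llm}\n"
    'Respond only with JSON: {{"relevance": "...", "explanation": "..."}}'
)

results = evaluator.run_full_evaluation(
    rag_system,
    ground_truth_file="data/ground_truth.csv",  # assumed path to the ground truth CSV
    prompt_template=judge_prompt,
)
print(results["best_params"], results["best_score"])

The database handler passed in is expected to expose save_rag_evaluation(eval_data) and get_elasticsearch_index_by_youtube_id(video_id), since the new save_evaluations_to_db delegates persistence to it instead of the removed direct sqlite3 access.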