update evaluation progress
backend/lighteval_task/lighteval_task.py
CHANGED
@@ -226,54 +226,84 @@ class JudgeLLMYourBench(JudgeLLM):
     )
 
     def compute(self, sample_ids: list[str], responses: list, formatted_docs: list[Doc]) -> list[dict[str, float]]:
-        logger.info(f"Golds: {golds}")
-
-        # Instead of using the judge, which seems to have problems,
-        # use a simplified approach based on the presence of the key elements
-        # of the reference answer in the model's answer
-        scores = []
-        for i in range(len(questions)):
-            prediction = str(predictions[i]).lower()
-            gold = str(golds[i]).lower()
-
-            # Extract the key terms from the reference answer (words longer than 4 letters)
-            key_terms = [word for word in gold.split() if len(word) > 4]
-
-            # Compute the proportion of key terms present in the model's answer
-            matches = sum(1 for term in key_terms if term in prediction)
-            coverage = matches / len(key_terms) if key_terms else 0
-
-        logger.info(f"Scores bruts: {scores}")
+        # Add debugging to see the full structure of the data
+        logger.info(f"Nombre de sample_ids: {len(sample_ids)}")
+        logger.info(f"Nombre de responses: {len(responses)}")
+        logger.info(f"Nombre de formatted_docs: {len(formatted_docs)}")
+
+        try:
+            # If we are evaluating a multiturn task, we need to have specific fields in the formatted doc
+            questions = [formatted_doc.specific["question"] for formatted_doc in formatted_docs]
+            golds = [formatted_doc.get_golds()[0] for formatted_doc in formatted_docs]
+            predictions = [response[0].result[0] for response in responses]
+            options = [None] * len(questions)
+
+            # Detailed logging before the problematic access
+            for i, doc in enumerate(formatted_docs):
+                logger.info(f"Document {i} - Clés: {doc.specific.keys()}")
+                if "chunks" in doc.specific:
+                    logger.info(f"Document {i} - Chunks: {doc.specific['chunks']} (type: {type(doc.specific['chunks'])}, longueur: {len(doc.specific['chunks'])})")
+                else:
+                    logger.info(f"Document {i} - Pas de chunks trouvés!")
+
+            # Guard against empty lists
+            chunks = []
+            for doc in formatted_docs:
+                if "chunks" in doc.specific and doc.specific["chunks"] and len(doc.specific["chunks"]) > 0:
+                    chunks.append(doc.specific["chunks"][0])
+                else:
+                    # Use a default value when chunks is missing or empty
+                    chunks.append("")
+
+            documents = [formatted_doc.specific["document"] for formatted_doc in formatted_docs]
+
+            # Extra logs for debugging
+            logger.info(f"Questions: {questions}")
+            logger.info(f"Predictions: {predictions}")
+            logger.info(f"Golds: {golds}")
+
+            # Instead of using the judge, which seems to have problems,
+            # use a simplified approach based on the presence of the key elements
+            # of the reference answer in the model's answer
+            scores = []
+            for i in range(len(questions)):
+                prediction = str(predictions[i]).lower()
+                gold = str(golds[i]).lower()
+
+                # Extract the key terms from the reference answer (words longer than 4 letters)
+                key_terms = [word for word in gold.split() if len(word) > 4]
+
+                # Compute the proportion of key terms present in the model's answer
+                matches = sum(1 for term in key_terms if term in prediction)
+                coverage = matches / len(key_terms) if key_terms else 0
+
+                # Consider an answer correct if it covers at least 40% of the key terms
+                # (less strict than the initial 60%, but stricter than 0%)
+                score = 1.0 if coverage >= 0.4 else 0.0
+
+                logger.info(f"Couverture des mots clés pour la question {i+1}: {coverage:.2f} ({matches}/{len(key_terms)})")
+                logger.info(f"Score attribué: {score}")
+
+                scores.append(score)
+
+            logger.info(f"Scores bruts: {scores}")
+
+            metrics = []
+            for i in range(len(sample_ids)):
+                metrics.append(
+                    {
+                        "accuracy": scores[i],
+                    }
+                )
+
+            return metrics
+
+        except Exception as e:
+            logger.error(f"Erreur dans la fonction compute: {str(e)}")
+            logger.exception("Détails de l'erreur:")
+
+            # Return a default result if an error occurs
+            return [{"accuracy": 0.0} for _ in sample_ids]
 
 
 ZEROSHOT_QA_USER_PROMPT = """Answer the following question:
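The new compute() scores answers with a plain keyword-coverage heuristic instead of the LLM judge. As a rough illustration of how the 40% threshold behaves, here is a minimal standalone sketch of the same idea; the helper name and the gold/prediction strings below are made up for illustration and are not taken from the repository. Note that, as in the diff, punctuation stays attached to words when splitting.

    # Minimal sketch of the keyword-coverage heuristic above (illustrative data only).
    def coverage_score(prediction: str, gold: str, threshold: float = 0.4) -> float:
        prediction = prediction.lower()
        # Key terms are simply the gold-answer words longer than 4 characters.
        key_terms = [word for word in gold.lower().split() if len(word) > 4]
        matches = sum(1 for term in key_terms if term in prediction)
        coverage = matches / len(key_terms) if key_terms else 0
        return 1.0 if coverage >= threshold else 0.0

    gold = "The treaty was signed in Vienna in 1815 after lengthy negotiations."
    prediction = "It was signed in Vienna following negotiations."
    # 3 of the 6 key terms ("signed", "vienna", "negotiations.") appear in the
    # prediction, so coverage is 0.5 >= 0.4 and the sample scores 1.0.
    print(coverage_score(prediction, gold))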
backend/tasks/evaluation_task.py
CHANGED
@@ -194,7 +194,26 @@ TASKS_TABLE = [yourbench]
         )
 
         try:
-
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Running command: {' '.join(cmd_args)}")
+            stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=self.timeout)
+
+            # Log stdout and stderr
+            if stdout:
+                stdout_decoded = stdout.decode('utf-8')
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] LightEval STDOUT for {model_name}:")
+                for line in stdout_decoded.splitlines():
+                    print(f"[STDOUT] {line}")
+
+            if stderr:
+                stderr_decoded = stderr.decode('utf-8')
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] LightEval STDERR for {model_name}:")
+                for line in stderr_decoded.splitlines():
+                    print(f"[STDERR] {line}")
+
+            # Check the return code
+            if process.returncode != 0:
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] LightEval failed with return code {process.returncode}")
+
         except asyncio.TimeoutError:
             process.kill()
             print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
@@ -230,11 +249,33 @@ TASKS_TABLE = [yourbench]
         try:
             # Get results from the output file
             results_dir = Path(output_dir) / "results" / model_name.replace("/", "/")
-
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Looking for results in {results_dir}")
+
+            if not results_dir.exists():
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] Results directory doesn't exist for {model_name}")
+                raise FileNotFoundError(f"Results directory not found: {results_dir}")
+
+            results_files = list(results_dir.glob("results_*.json"))
+            if not results_files:
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] No results files found in {results_dir}")
+                raise FileNotFoundError(f"No results files found in {results_dir}")
+
+            results_file = results_files[0]
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Using results file: {results_file}")
 
             with open(results_file) as f:
                 results = json.load(f)
-
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Results structure: {json.dumps(list(results.keys()))}")
+
+            # Check that the structure is the expected one
+            if "results" in results and "all" in results["results"] and "accuracy" in results["results"]["all"]:
+                accuracy = results["results"]["all"]["accuracy"]
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] Extracted accuracy: {accuracy}")
+            else:
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] Structure de résultats inattendue. Clés disponibles: {list(results.keys())}")
+                if "results" in results:
+                    print(f"[{datetime.now().strftime('%H:%M:%S')}] Clés dans 'results': {list(results['results'].keys()) if isinstance(results['results'], dict) else 'pas un dictionnaire'}")
+                raise ValueError(f"Structure de résultats inattendue pour {model_name}")
 
             result_data = {
                 "model": model_name,
@@ -274,7 +315,7 @@ TASKS_TABLE = [yourbench]
         # Load environment variables
         load_dotenv()
 
-        # Models to evaluate
+        # Models to evaluate - only the models that are accessible
         models = [
             "Qwen/QwQ-32B",
             "Qwen/Qwen2.5-72B-Instruct",
@@ -283,6 +324,21 @@ TASKS_TABLE = [yourbench]
             "mistralai/Mistral-Small-24B-Instruct-2501",
         ]
 
+        # Log the dataset structure for inspection
+        try:
+            from datasets import load_dataset
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Tentative de chargement du dataset {self.dataset_name} pour inspection")
+            dataset = load_dataset(self.dataset_name, "single_shot_questions", split="train")
+
+            # Check the structure of the first example
+            if len(dataset) > 0:
+                first_example = dataset[0]
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] Structure du premier exemple:")
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] Clés: {first_example.keys()}")
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] Citations: {first_example.get('citations', 'non trouvé')}")
+        except Exception as e:
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] Erreur lors de l'inspection du dataset: {str(e)}")
+
         # Step 1: Check available providers for each model
         await self.update_step("finding_available_model_providers")
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Checking available providers for models...")
print(f"[{datetime.now().strftime('%H:%M:%S')}] Checking available providers for models...")
|
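The dataset-inspection block in the last hunk only loads the "single_shot_questions" config to print the keys and the citations field of the first row. The same check can be run standalone; the dataset name below is a placeholder for whatever self.dataset_name resolves to at runtime.

    from datasets import load_dataset

    # Placeholder dataset name; the Space passes self.dataset_name here.
    dataset = load_dataset("<org>/<yourbench-dataset>", "single_shot_questions", split="train")
    if len(dataset) > 0:
        first_example = dataset[0]
        print(list(first_example.keys()))
        print(first_example.get("citations", "not found"))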
frontend/src/components/BenchmarkGenerator.jsx
CHANGED
@@ -539,7 +539,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
             fontWeight: 500,
           }}
         >
-          Estimated time: ~
+          Estimated time: ~ 1m30s
         </Typography>
       </Box>
 