PabloTJ committed · Commit 4136261 · verified · 1 parent: be7a374

Update app.py

Files changed (1): app.py (+72 -69)
app.py CHANGED
@@ -7,35 +7,39 @@ import pandas as pd
 # Set a seed for reproducibility
 set_seed(42)
 
-# List of premium generation models (as suggested by the Vellum AI leaderboard)
-generation_model_names = [
-    "mistralai/Mistral-7B-v0.1",
-    "mistralai/Mixtral-8x7B-v0.1",
-    "meta-llama/Llama-4-Scout",
-    "meta-llama/Llama-4-Maverick",
-    "Qwen/Qwen2.5-72B",
-    "HuggingFaceH4/zephyr-7b-beta",
-    "01-ai/Yi-34B",
-    "deepseek-ai/deepseek-llm-67b-base",
-    "HuggingFaceH4/zephyr-7b-alpha",
-    "microsoft/Marcoroni-7B-v3"
+# Define five small models for generation (free, lightweight)
+small_models = [
+    "distilgpt2",               # ~82M parameters
+    "gpt2",                     # ~124M parameters
+    "EleutherAI/gpt-neo-125M",  # ~125M parameters
+    "sshleifer/tiny-gpt2",      # extremely small variant
+    "microsoft/DialoGPT-small"  # DialoGPT in its small size
 ]
 
-# List of cost-effective grammar evaluation models
+# Define five languages (English, German, Spanish, French, Portuguese)
+languages = {
+    "en": "English",
+    "de": "German",
+    "es": "Spanish",
+    "fr": "French",
+    "pt": "Portuguese"
+}
+
+# Define two cost-effective grammar evaluation models
 grammar_model_names = [
     "vennify/t5-base-grammar-correction",
     "hassaanik/grammar-correction-model"
 ]
 
-# Load a generation pipeline given the model name.
+# Functions to load pipelines on demand
 def load_generation_pipeline(model_name):
     try:
+        # Use the text-generation pipeline for causal LM models
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
         return None
 
-# Load a grammar evaluation pipeline (text2text-generation)
 def load_grammar_pipeline(model_name):
     try:
         return pipeline("text2text-generation", model=model_name)
@@ -43,19 +47,14 @@ def load_grammar_pipeline(model_name):
         print(f"Error loading grammar model {model_name}: {e}")
         return None
 
-# Pre-load grammar evaluator models (assumed to be cost-effective and stable)
+# Pre-load grammar evaluator pipelines
rater_models = []
 for model_name in grammar_model_names:
     p = load_grammar_pipeline(model_name)
     if p is not None:
         rater_models.append(p)
 
-# Language dictionary
-languages = {
-    "en": "English", "es": "Spanish", "fr": "French", "de": "German", "it": "Italian",
-    "pt": "Portuguese", "ru": "Russian", "ar": "Arabic", "hi": "Hindi", "ja": "Japanese"
-}
-
+# Utility functions for checking palindromes and cleaning text
 def clean_text(text):
     return re.sub(r'[^a-zA-Z0-9]', '', text.lower())
 
@@ -73,63 +72,67 @@ def extract_score(text):
         return min(max(score, 0), 100)
     return 0
 
-def run_benchmark(selected_model):
-    # Load the selected premium generation pipeline
-    gen_model = load_generation_pipeline(selected_model)
-    if gen_model is None:
-        return "Error loading generation model."
-
+# Main benchmark function that runs all tests at once
+def run_benchmark_all():
     results = []
-    for code, lang in languages.items():
-        prompt = (
-            f"Write the longest original palindrome you can in {lang}. "
-            f"It should be creative and not a known palindrome. "
-            f"If it is not a correct palindrome, you will lose points according to how correct it is."
-        )
-        try:
-            gen_output = gen_model(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
-        except Exception as e:
-            gen_output = f"Error generating text: {e}"
-        valid = is_palindrome(gen_output)
-        cleaned_len = len(clean_text(gen_output))
-
-        scores = []
-        for rater in rater_models:
-            rprompt = grammar_prompt(gen_output, lang)
+    for model_name in small_models:
+        # Load the generation pipeline for the current small model
+        gen_pipeline = load_generation_pipeline(model_name)
+        if gen_pipeline is None:
+            continue  # Skip if the model fails to load
+
+        for code, lang in languages.items():
+            # Prompt for generating a palindrome in the given language
+            prompt = (
+                f"Write the longest original palindrome you can in {lang}. "
+                "It should be creative and not a known palindrome. "
+                "If it is not a correct palindrome, you will lose points according to how correct it is."
+            )
             try:
-                # For a text2text model, we assume the output contains a number (0-100)
-                rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
-                score = extract_score(rtext)
-                scores.append(score)
+                gen_output = gen_pipeline(prompt, max_new_tokens=50, do_sample=True)[0]['generated_text'].strip()
             except Exception as e:
-                scores.append(0)
-        avg_score = np.mean(scores) if scores else 0
-        penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
-        final_score = round(cleaned_len * penalty, 2)
-
-        results.append({
-            "Language": lang,
-            "Palindrome": gen_output,
-            "Valid": "✅" if valid else "❌",
-            "Length": cleaned_len,
-            "Grammar Score": avg_score,
-            "Final Score": final_score
-        })
+                gen_output = f"Error generating text: {e}"
+
+            valid = is_palindrome(gen_output)
+            cleaned_len = len(clean_text(gen_output))
+
+            # Measure grammar quality using both rater models
+            scores = []
+            for rater in rater_models:
+                rprompt = grammar_prompt(gen_output, lang)
+                try:
+                    rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
+                    score = extract_score(rtext)
+                    scores.append(score)
+                except Exception as e:
+                    scores.append(0)
+            avg_score = np.mean(scores) if scores else 0
+            # Apply a penalty if the text is not a valid palindrome
+            penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
+            final_score = round(cleaned_len * penalty, 2)
+
+            results.append({
+                "Model": model_name,
+                "Language": lang,
+                "Palindrome": gen_output,
+                "Valid": "✅" if valid else "❌",
+                "Length": cleaned_len,
+                "Grammar Score": avg_score,
+                "Final Score": final_score
+            })
 
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
     return gr.Dataframe(df)
 
-# Build the Gradio UI using Blocks (canvas layout)
-with gr.Blocks(title="LLM Palindrome Benchmark - Premium Generation Models") as demo:
-    gr.Markdown("# LLM Palindrome Benchmark")
-    gr.Markdown("Select one of the premium generation models below (for non-commercial, educational usage) and run the benchmark.")
+# Build the Gradio UI using Blocks (canvas layout)
+with gr.Blocks(title="Small Model Palindrome Benchmark") as demo:
+    gr.Markdown("# Small Model Palindrome Benchmark")
+    gr.Markdown("This benchmark runs automatically overnight across 5 small text-generation models and 5 languages (English, German, Spanish, French, Portuguese). All tests are run at once.")
 
     with gr.Row():
-        model_dropdown = gr.Dropdown(choices=generation_model_names, label="Select Premium Generation Model")
-        run_button = gr.Button("Run Benchmark")
-
+        run_button = gr.Button("Run All Benchmarks")
     output_table = gr.Dataframe(label="Benchmark Results")
 
-    run_button.click(fn=run_benchmark, inputs=model_dropdown, outputs=output_table)
+    run_button.click(fn=run_benchmark_all, inputs=[], outputs=output_table)
 
 demo.launch()
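
For reference, the diff calls three helpers whose definitions sit outside its context lines: is_palindrome, grammar_prompt, and the body of extract_score. Below is a minimal sketch of plausible definitions, consistent with the visible fragments (the clean_text regex and the clamp-and-default tail of extract_score); the actual code in app.py may differ.

import re

def is_palindrome(text):
    # Assumed: palindrome check on the cleaned (lowercased, alphanumeric-only) text
    cleaned = re.sub(r'[^a-zA-Z0-9]', '', text.lower())
    return len(cleaned) > 0 and cleaned == cleaned[::-1]

def grammar_prompt(text, lang):
    # Hypothetical prompt template; the real wording is not shown in the diff
    return f"Rate the grammar of the following {lang} text from 0 to 100. Reply with a number only: {text}"

def extract_score(text):
    # Matches the visible tail: clamp the first integer found to [0, 100], default 0
    match = re.search(r'\d+', text)
    if match:
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0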
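The scoring rule in run_benchmark_all weights the palindrome's alphanumeric length by the average grammar score, and halves that weight when the output fails the palindrome check. A worked example with made-up numbers:

cleaned_len = 20   # alphanumeric characters in the generated text
avg_score = 80.0   # mean of the two rater scores (0-100)

final_if_valid = round(cleaned_len * (avg_score / 100), 2)          # 16.0
final_if_invalid = round(cleaned_len * (avg_score / 100) * 0.5, 2)  # 8.0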
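To smoke-test one of the small models outside the Space, the same transformers calls the app relies on can be run standalone. A sketch (weights download on first use):

from transformers import pipeline, set_seed

set_seed(42)  # same seed the app sets
gen = pipeline("text-generation", model="distilgpt2")
out = gen(
    "Write the longest original palindrome you can in English.",
    max_new_tokens=50,
    do_sample=True,
)[0]["generated_text"]
print(out)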