PabloTJ committed (verified)
Commit 4963b4f · Parent(s): 3fb2bff

Update app.py

Files changed (1): app.py (+37 -33)
app.py CHANGED
@@ -7,16 +7,13 @@ import pandas as pd
 # Set a seed for reproducibility
 set_seed(42)
 
-# Define five small models for generation (free, lightweight)
-small_models = [
-    "distilgpt2",               # ~82M parameters
-    "gpt2",                     # ~124M parameters
-    "EleutherAI/gpt-neo-125M",  # ~125M parameters
-    "sshleifer/tiny-gpt2",      # extremely small variant
-    "microsoft/DialoGPT-small"  # DialoGPT small
+# Define two premium generation models for better quality outputs.
+premium_models = [
+    "mistralai/Mistral-7B-v0.1",
+    "HuggingFaceH4/zephyr-7b-beta"
 ]
 
-# Define five languages: English, German, Spanish, French, Portuguese
+# Define five languages: English, German, Spanish, French, Portuguese.
 languages = {
     "en": "English",
     "de": "German",
@@ -25,16 +22,16 @@ languages = {
     "pt": "Portuguese"
 }
 
-# Define two cost-effective grammar evaluation models (unchanged)
+# Define two cost-effective grammar evaluation models.
 grammar_model_names = [
     "vennify/t5-base-grammar-correction",
     "hassaanik/grammar-correction-model"
 ]
 
-# Functions to load pipelines on demand
+# Functions to load pipelines on demand.
 def load_generation_pipeline(model_name):
     try:
-        # Using text-generation pipeline for causal LM models
+        # The text-generation pipeline loads a causal LM.
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
@@ -42,19 +39,20 @@ def load_generation_pipeline(model_name):
 
 def load_grammar_pipeline(model_name):
     try:
+        # Using text2text-generation for grammar correction.
         return pipeline("text2text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading grammar model {model_name}: {e}")
         return None
 
-# Pre-load grammar evaluator pipelines
+# Pre-load grammar evaluator pipelines.
 rater_models = []
 for model_name in grammar_model_names:
     p = load_grammar_pipeline(model_name)
     if p is not None:
         rater_models.append(p)
 
-# Utility functions for checking palindromes and cleaning text
+# Utility functions.
 def clean_text(text):
     return re.sub(r'[^a-zA-Z0-9]', '', text.lower())
 
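For reference, the palindrome check normalizes case and strips everything that is not an ASCII letter or digit before comparing the string to its reverse. A quick standalone illustration mirroring the two utilities above:

```python
import re

def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9]', '', text.lower())

def is_palindrome(text):
    cleaned = clean_text(text)
    return cleaned == cleaned[::-1]

print(is_palindrome("A man, a plan, a canal: Panama!"))  # True
print(is_palindrome("Palindrome: hello"))                # False
print(is_palindrome(""))                                 # True: empty string passes
```

One caveat: the ASCII-only character class deletes accented letters entirely, so German, Spanish, French, and Portuguese outputs are judged only on their unaccented characters.
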
@@ -62,17 +60,21 @@ def is_palindrome(text):
     cleaned = clean_text(text)
     return cleaned == cleaned[::-1]
 
-# Updated prompt that instructs the model to output ONLY the palindrome.
+# Updated prompt which explicitly instructs the model to output only a palindrome.
 def build_prompt(lang):
     return (
-        f"Instruction: Write the longest original palindrome you can in {lang}. "
-        "The output should contain nothing else but the palindrome. "
-        "Do not include any additional commentary or repeated instructions. "
+        f"Instruction: Generate a single original palindrome in {lang}.\n"
+        "Output only the palindrome. The palindrome should be a continuous text that reads the same forward and backward.\n"
+        "Do not output any additional text, commentary, or the prompt itself.\n"
         "Palindrome: "
     )
 
 def grammar_prompt(pal, lang):
-    return f'''Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. Only return a number with no explanation:\n\n"{pal}"\n'''
+    return (
+        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
+        "Return only a number with no explanation.\n\n"
+        f'"{pal}"\n'
+    )
 
 def extract_score(text):
     match = re.search(r"\d{1,3}", text)
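
`extract_score` (split across this hunk and the next) grabs the first one-to-three-digit run in a rater's reply and clamps it to [0, 100]. Assuming the elided middle of the function converts the regex match with `int(match.group())`, its behavior looks like this (illustrative only):

```python
import re

def extract_score(text):
    match = re.search(r"\d{1,3}", text)
    if match:  # assumed from the surrounding hunks
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0

print(extract_score("85"))                # 85
print(extract_score("I'd rate it 9/10"))  # 9  -- the first digit run wins
print(extract_score("999"))               # 100 -- clamped to the upper bound
print(extract_score("no digits"))         # 0
```
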
@@ -81,28 +83,29 @@ def extract_score(text):
         return min(max(score, 0), 100)
     return 0
 
-# Main benchmark function that runs all tests at once
+# Main benchmark function running all tests at once.
 def run_benchmark_all():
     results = []
-    # Iterate over each small model
-    for model_name in small_models:
+
+    # Iterate over each premium model.
+    for model_name in premium_models:
         gen_pipeline = load_generation_pipeline(model_name)
         if gen_pipeline is None:
-            continue  # Skip this model if it fails to load
-
-        # Iterate over each language
+            continue  # Skip if model loading failed.
+
+        # Iterate over the five languages.
         for code, lang in languages.items():
             prompt = build_prompt(lang)
             try:
-                # Generate text with a moderate max token limit
-                gen_output = gen_pipeline(prompt, max_new_tokens=50, do_sample=True)[0]['generated_text'].strip()
+                # Generate output with a moderate token limit; adjust max_new_tokens if needed.
+                gen_output = gen_pipeline(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
             except Exception as e:
                 gen_output = f"Error generating text: {e}"
-            # Check if the generated output is a palindrome
+
             valid = is_palindrome(gen_output)
             cleaned_len = len(clean_text(gen_output))
 
-            # Evaluate grammar using both grammar models
+            # Evaluate grammar using both grammar models.
             scores = []
             for rater in rater_models:
                 rprompt = grammar_prompt(gen_output, lang)
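
One behavior worth flagging in the generation call above: by default the text-generation pipeline returns the prompt concatenated with the continuation in `generated_text`, so `is_palindrome` ends up checking the prompt text too. A sketch of how the continuation alone could be isolated, using the pipeline's standard `return_full_text` flag (a possible follow-up, not something this commit does):

```python
# Sketch: request only the newly generated tokens, not prompt + continuation.
out = gen_pipeline(prompt, max_new_tokens=100, do_sample=True,
                   return_full_text=False)
gen_output = out[0]['generated_text'].strip()
```
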
@@ -113,7 +116,7 @@ def run_benchmark_all():
             except Exception as e:
                 scores.append(0)
             avg_score = np.mean(scores) if scores else 0
-            # Penalize if the generated text is not a valid palindrome
+            # Apply penalty if the output is not a valid palindrome.
             penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
             final_score = round(cleaned_len * penalty, 2)
 
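The scoring rule in this hunk is length times grammar quality, with the grammar factor halved when the output fails the palindrome check. A worked example with made-up numbers:

```python
cleaned_len = 24   # alphanumeric characters left after clean_text
avg_score = 80     # mean rating from the two grammar models
valid = False      # output was not a true palindrome

penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
final_score = round(cleaned_len * penalty, 2)
print(final_score)  # 9.6 -- would be 19.2 had the palindrome check passed
```
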
@@ -130,10 +133,10 @@ def run_benchmark_all():
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
     return gr.Dataframe(df)
 
-# Build the Gradio UI using Blocks (canvas layout)
-with gr.Blocks(title="Small Model Palindrome Benchmark") as demo:
-    gr.Markdown("# Small Model Palindrome Benchmark")
-    gr.Markdown("This benchmark automatically runs over 5 small text-generation models and 5 languages (English, German, Spanish, French, Portuguese).")
+# Gradio UI built with Blocks for a canvas-style layout.
+with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
+    gr.Markdown("# Premium Model Palindrome Benchmark")
+    gr.Markdown("This benchmark runs automatically over two premium text-generation models across 5 languages (English, German, Spanish, French, Portuguese).")
 
     with gr.Row():
         run_button = gr.Button("Run All Benchmarks")
@@ -142,3 +145,4 @@ with gr.Blocks(title="Small Model Palindrome Benchmark") as demo:
     run_button.click(fn=run_benchmark_all, inputs=[], outputs=output_table)
 
 demo.launch()
+