PabloTJ committed
Commit 3fb2bff · verified · 1 Parent(s): 4136261

Update app.py

Files changed (1):
  app.py  +23 -17
app.py CHANGED
@@ -13,10 +13,10 @@ small_models = [
     "gpt2",                       # ~124M parameters
     "EleutherAI/gpt-neo-125M",    # ~125M parameters
     "sshleifer/tiny-gpt2",        # extremely small variant
-    "microsoft/DialoGPT-small"    # dialoGPT in small size
+    "microsoft/DialoGPT-small"    # DialoGPT small
 ]
 
-# Define five languages (English, German, Spanish, French, Portuguese)
+# Define five languages: English, German, Spanish, French, Portuguese
 languages = {
     "en": "English",
     "de": "German",
@@ -25,7 +25,7 @@ languages = {
     "pt": "Portuguese"
 }
 
-# Define two cost-effective grammar evaluation models
+# Define two cost-effective grammar evaluation models (unchanged)
 grammar_model_names = [
     "vennify/t5-base-grammar-correction",
     "hassaanik/grammar-correction-model"
@@ -34,7 +34,7 @@ grammar_model_names = [
 # Functions to load pipelines on demand
 def load_generation_pipeline(model_name):
     try:
-        # Use text-generation pipeline for causal LM models
+        # Using text-generation pipeline for causal LM models
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
@@ -62,6 +62,15 @@ def is_palindrome(text):
     cleaned = clean_text(text)
     return cleaned == cleaned[::-1]
 
+# Updated prompt that instructs the model to output ONLY the palindrome.
+def build_prompt(lang):
+    return (
+        f"Instruction: Write the longest original palindrome you can in {lang}. "
+        "The output should contain nothing else but the palindrome. "
+        "Do not include any additional commentary or repeated instructions. "
+        "Palindrome: "
+    )
+
 def grammar_prompt(pal, lang):
     return f'''Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. Only return a number with no explanation:\n\n"{pal}"\n'''
 
@@ -75,28 +84,25 @@ def extract_score(text):
 # Main benchmark function that runs all tests at once
 def run_benchmark_all():
     results = []
+    # Iterate over each small model
     for model_name in small_models:
-        # Load the generation pipeline for the current small model
         gen_pipeline = load_generation_pipeline(model_name)
         if gen_pipeline is None:
-            continue  # Skip if model fails to load
+            continue  # Skip this model if it fails to load
 
+        # Iterate over each language
         for code, lang in languages.items():
-            # Prompt for generating a palindrome in the given language
-            prompt = (
-                f"Write the longest original palindrome you can in {lang}. "
-                "It should be creative and not a known palindrome. "
-                "If it is not a correct palindrome, you will lose points according to how correct it is."
-            )
+            prompt = build_prompt(lang)
             try:
+                # Generate text with a moderate max token limit
                 gen_output = gen_pipeline(prompt, max_new_tokens=50, do_sample=True)[0]['generated_text'].strip()
             except Exception as e:
                 gen_output = f"Error generating text: {e}"
-
+            # Check if the generated output is a palindrome
             valid = is_palindrome(gen_output)
            cleaned_len = len(clean_text(gen_output))
 
-            # Measure grammar evaluation using both rater models
+            # Evaluate grammar using both grammar models
             scores = []
             for rater in rater_models:
                 rprompt = grammar_prompt(gen_output, lang)
@@ -107,7 +113,7 @@ def run_benchmark_all():
             except Exception as e:
                 scores.append(0)
             avg_score = np.mean(scores) if scores else 0
-            # Apply a penalty if the text is not a valid palindrome
+            # Penalize if the generated text is not a valid palindrome
             penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
             final_score = round(cleaned_len * penalty, 2)
 
@@ -124,10 +130,10 @@ def run_benchmark_all():
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
     return gr.Dataframe(df)
 
-# Build Gradio UI using Blocks (canvas layout)
+# Build the Gradio UI using Blocks (canvas layout)
 with gr.Blocks(title="Small Model Palindrome Benchmark") as demo:
     gr.Markdown("# Small Model Palindrome Benchmark")
-    gr.Markdown("This benchmark runs automatically during the night over 5 small text-generation models and 5 languages (English, German, Spanish, French, Portuguese). All tests are run at once.")
+    gr.Markdown("This benchmark automatically runs over 5 small text-generation models and 5 languages (English, German, Spanish, French, Portuguese).")
 
     with gr.Row():
         run_button = gr.Button("Run All Benchmarks")
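
For reference, a minimal standalone sketch of the scoring rule used in run_benchmark_all above. The body of clean_text is not visible in this diff, so the regex version below is an assumption, and the example strings and the 80/100 grammar score are made up for illustration:

import re

def clean_text(text):
    # Assumed implementation: keep only lowercase alphanumerics
    # (the real clean_text is defined outside the visible hunks).
    return re.sub(r"[^a-z0-9]", "", text.lower())

def is_palindrome(text):
    cleaned = clean_text(text)
    return cleaned == cleaned[::-1]

def final_score(gen_output, avg_score):
    # Scoring rule from the diff: cleaned length scaled by the average
    # grammar score, halved when the text is not a valid palindrome.
    valid = is_palindrome(gen_output)
    cleaned_len = len(clean_text(gen_output))
    penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
    return round(cleaned_len * penalty, 2)

print(final_score("Able was I ere I saw Elba", 80))   # 19 chars * 0.8 -> 15.2
print(final_score("This is not a palindrome", 80))    # 20 chars * 0.4 -> 8.0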