Commit ec2f5cd by PabloTJ · verified · 1 Parent(s): 4963b4f

Update app.py

Files changed (1):
  1. app.py +16 -16

app.py CHANGED
@@ -31,7 +31,6 @@ grammar_model_names = [
 # Functions to load pipelines on demand.
 def load_generation_pipeline(model_name):
     try:
-        # The text-generation pipeline loads a causal LM.
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
@@ -39,7 +38,6 @@ def load_generation_pipeline(model_name):
 
 def load_grammar_pipeline(model_name):
     try:
-        # Using text2text-generation for grammar correction.
         return pipeline("text2text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading grammar model {model_name}: {e}")
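The two loaders above lean on setup earlier in app.py that this diff only hints at (the `grammar_model_names = [` fragment in the first hunk header, plus `premium_models`, `languages`, and `rater_models` used further down). For readers of the commit in isolation, here is a minimal sketch of that assumed setup; the model names and language codes are placeholders, not taken from the diff:

    import re
    import numpy as np
    import pandas as pd
    import gradio as gr
    from transformers import pipeline

    # Placeholder names -- the real lists live above the diff context.
    premium_models = ["example/premium-model-a", "example/premium-model-b"]
    grammar_model_names = ["example/grammar-rater-a", "example/grammar-rater-b"]
    # Keys are assumed; the five language names match the UI description below.
    languages = {"en": "English", "de": "German", "es": "Spanish",
                 "fr": "French", "pt": "Portuguese"}

    # The tail of this loop is visible as context in the next hunk;
    # the assignment line is inferred.
    rater_models = []
    for model_name in grammar_model_names:
        p = load_grammar_pipeline(model_name)
        if p is not None:
            rater_models.append(p)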
@@ -52,7 +50,6 @@ for model_name in grammar_model_names:
     if p is not None:
         rater_models.append(p)
 
-# Utility functions.
 def clean_text(text):
     return re.sub(r'[^a-zA-Z0-9]', '', text.lower())
 
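One behavior of `clean_text` worth flagging for a multilingual benchmark: the ASCII-only character class deletes accented letters rather than transliterating them, so outputs in four of the five benchmark languages are judged on their unaccented characters only. A quick illustration:

    clean_text("été")    # -> 't'    (é/è are removed, not mapped to 'e')
    clean_text("señor")  # -> 'seor' (ñ is dropped entirely)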
@@ -60,7 +57,7 @@ def is_palindrome(text):
     cleaned = clean_text(text)
     return cleaned == cleaned[::-1]
 
-# Updated prompt which explicitly instructs the model to output only a palindrome.
+# Updated prompt that instructs the model to output ONLY the palindrome.
 def build_prompt(lang):
     return (
         f"Instruction: Generate a single original palindrome in {lang}.\n"
@@ -83,21 +80,18 @@ def extract_score(text):
         return min(max(score, 0), 100)
     return 0
 
-# Main benchmark function running all tests at once.
+# Main benchmark function that runs all tests at once and saves results to a CSV file.
 def run_benchmark_all():
     results = []
 
-    # Iterate over each premium model.
     for model_name in premium_models:
         gen_pipeline = load_generation_pipeline(model_name)
         if gen_pipeline is None:
-            continue  # Skip if model loading failed.
+            continue
 
-        # Iterate over the five languages.
         for code, lang in languages.items():
            prompt = build_prompt(lang)
            try:
-                # Generate output with a moderate token limit; adjust max_new_tokens if needed.
                gen_output = gen_pipeline(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
            except Exception as e:
                gen_output = f"Error generating text: {e}"
@@ -105,7 +99,6 @@ def run_benchmark_all():
             valid = is_palindrome(gen_output)
             cleaned_len = len(clean_text(gen_output))
 
-            # Evaluate grammar using both grammar models.
             scores = []
             for rater in rater_models:
                 rprompt = grammar_prompt(gen_output, lang)
@@ -116,7 +109,6 @@ def run_benchmark_all():
                 except Exception as e:
                     scores.append(0)
             avg_score = np.mean(scores) if scores else 0
-            # Apply penalty if the output is not a valid palindrome.
             penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
             final_score = round(cleaned_len * penalty, 2)
 
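For concreteness, the retained scoring lines work like this (illustrative numbers only): a cleaned output of 21 characters with an average grammar score of 80 scores 21 × 0.80 = 16.8 if it is a valid palindrome, and half that, 8.4, if it is not:

    cleaned_len, avg_score = 21, 80.0
    for valid in (True, False):
        penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
        print(round(cleaned_len * penalty, 2))  # 16.8, then 8.4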
@@ -130,19 +122,27 @@ def run_benchmark_all():
                 "Final Score": final_score
             })
 
+    # Create DataFrame and sort by Final Score.
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
-    return gr.Dataframe(df)
+
+    # Save results to CSV file.
+    csv_path = "benchmark_results.csv"
+    df.to_csv(csv_path, index=False)
+
+    return gr.Dataframe(df), csv_path
 
-# Gradio UI built with Blocks for a canvas-style layout.
+# Gradio UI using Blocks for a canvas layout.
 with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
     gr.Markdown("# Premium Model Palindrome Benchmark")
-    gr.Markdown("This benchmark runs automatically over two premium text-generation models across 5 languages (English, German, Spanish, French, Portuguese).")
+    gr.Markdown("This benchmark runs automatically over 2 premium text-generation models across 5 languages (English, German, Spanish, French, Portuguese), and saves the results to a CSV file for later review.")
 
     with gr.Row():
         run_button = gr.Button("Run All Benchmarks")
+
+    # The interface now outputs both a DataFrame and a File Download.
     output_table = gr.Dataframe(label="Benchmark Results")
+    output_file = gr.File(label="Download CSV Results")
 
-    run_button.click(fn=run_benchmark_all, inputs=[], outputs=output_table)
+    run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])
 
 demo.launch()
-
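On the headline change in this last hunk: `run_benchmark_all` now returns two values, matched positionally to `outputs=[output_table, output_file]` in the click handler, and `gr.File` accepts the CSV path string directly. Returning the plain DataFrame, rather than wrapping it in `gr.Dataframe(...)`, is the more typical pattern for a `gr.Dataframe` output; a minimal equivalent ending for the function, shown only as a sketch:

    # Equivalent return: let Gradio populate the components from plain values.
    csv_path = "benchmark_results.csv"
    df.to_csv(csv_path, index=False)
    return df, csv_path  # DataFrame -> output_table, file path -> output_file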