PabloTJ committed on
Commit
fa826ee
·
verified ·
1 Parent(s): 171b1a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -23
app.py CHANGED
@@ -5,12 +5,16 @@ import numpy as np
5
  import pandas as pd
6
  import os
7
 
8
- # Set a seed for reproducibility
9
  set_seed(42)
10
 
11
- # Define two premium generation models for better quality outputs.
12
  premium_models = [
13
- "mistralai/Mistral-7B-v0.1",
 
 
 
 
14
  "HuggingFaceH4/zephyr-7b-beta"
15
  ]
16
 
@@ -23,13 +27,13 @@ languages = {
23
  "pt": "Portuguese"
24
  }
25
 
26
- # Define two cost-effective grammar evaluation models.
27
  grammar_model_names = [
28
  "vennify/t5-base-grammar-correction",
29
  "hassaanik/grammar-correction-model"
30
  ]
31
 
32
- # Functions to load pipelines on demand.
33
  def load_generation_pipeline(model_name):
34
  try:
35
  return pipeline("text-generation", model=model_name)
@@ -37,6 +41,7 @@ def load_generation_pipeline(model_name):
37
  print(f"Error loading generation model {model_name}: {e}")
38
  return None
39
 
 
40
  def load_grammar_pipeline(model_name):
41
  try:
42
  return pipeline("text2text-generation", model=model_name)
@@ -44,13 +49,14 @@ def load_grammar_pipeline(model_name):
44
  print(f"Error loading grammar model {model_name}: {e}")
45
  return None
46
 
47
- # Pre-load grammar evaluator pipelines.
48
  rater_models = []
49
  for model_name in grammar_model_names:
50
  p = load_grammar_pipeline(model_name)
51
  if p is not None:
52
  rater_models.append(p)
53
 
 
54
  def clean_text(text):
55
  return re.sub(r'[^a-zA-Z0-9]', '', text.lower())
56
 
@@ -58,15 +64,16 @@ def is_palindrome(text):
58
  cleaned = clean_text(text)
59
  return cleaned == cleaned[::-1]
60
 
61
- # Updated prompt that instructs the model to output ONLY the palindrome.
62
  def build_prompt(lang):
63
  return (
64
  f"Instruction: Generate a single original palindrome in {lang}.\n"
65
  "Output only the palindrome. The palindrome should be a continuous text that reads the same forward and backward.\n"
66
- "Do not output any additional text, commentary, or the prompt itself.\n"
67
  "Palindrome: "
68
  )
69
 
 
70
  def grammar_prompt(pal, lang):
71
  return (
72
  f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
@@ -74,6 +81,7 @@ def grammar_prompt(pal, lang):
74
  f'"{pal}"\n'
75
  )
76
 
 
77
  def extract_score(text):
78
  match = re.search(r"\d{1,3}", text)
79
  if match:
@@ -81,25 +89,23 @@ def extract_score(text):
81
  return min(max(score, 0), 100)
82
  return 0
83
 
84
- # Main benchmark function that runs all tests at once and saves results automatically.
85
  def run_benchmark_all():
86
  results = []
87
-
88
  for model_name in premium_models:
89
  gen_pipeline = load_generation_pipeline(model_name)
90
  if gen_pipeline is None:
91
  continue
92
-
93
  for code, lang in languages.items():
94
  prompt = build_prompt(lang)
95
  try:
96
  gen_output = gen_pipeline(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
97
  except Exception as e:
98
  gen_output = f"Error generating text: {e}"
99
-
100
  valid = is_palindrome(gen_output)
101
  cleaned_len = len(clean_text(gen_output))
102
 
 
103
  scores = []
104
  for rater in rater_models:
105
  rprompt = grammar_prompt(gen_output, lang)
@@ -123,28 +129,23 @@ def run_benchmark_all():
123
  "Final Score": final_score
124
  })
125
 
126
- # Create DataFrame and sort by Final Score.
127
  df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
128
-
129
- # Automatically save results to a CSV file.
130
  csv_path = "benchmark_results.csv"
131
  df.to_csv(csv_path, index=False)
132
- print(f"CSV file saved to {os.path.abspath(csv_path)}")
133
-
134
- # Return both the DataFrame and the CSV file path for download.
135
  return gr.Dataframe(df), csv_path
136
 
137
- # Build the Gradio UI using Blocks for a canvas layout.
138
  with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
139
  gr.Markdown("# Premium Model Palindrome Benchmark")
140
- gr.Markdown("This benchmark runs automatically over 2 premium text-generation models across 5 languages (English, German, Spanish, French, Portuguese) and saves the results to a CSV file when done.")
141
-
 
 
142
  with gr.Row():
143
  run_button = gr.Button("Run All Benchmarks")
144
-
145
  output_table = gr.Dataframe(label="Benchmark Results")
146
  output_file = gr.File(label="Download CSV Results")
147
-
148
  run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])
149
 
150
  demo.launch()
 
5
  import pandas as pd
6
  import os
7
 
8
+ # Set seed for reproducibility
9
  set_seed(42)
10
 
11
+ # Define the six premium generation models:
12
  premium_models = [
13
+ "Qwen/Qwen2.5-Omni-7B",
14
+ "Qwen/Qwen2.5-VL-7B-Instruct",
15
+ "deepseek-ai/Janus-Pro-7B",
16
+ "meta-llama/Llama-2-7b-hf",
17
+ "Alibaba-NLP/gte-Qwen2-7B-instruct",
18
  "HuggingFaceH4/zephyr-7b-beta"
19
  ]
20
 
 
27
  "pt": "Portuguese"
28
  }
29
 
30
+ # Define two cost-effective grammar evaluation models:
31
  grammar_model_names = [
32
  "vennify/t5-base-grammar-correction",
33
  "hassaanik/grammar-correction-model"
34
  ]
35
 
36
+ # Function to load generation pipelines on demand
37
  def load_generation_pipeline(model_name):
38
  try:
39
  return pipeline("text-generation", model=model_name)
 
41
  print(f"Error loading generation model {model_name}: {e}")
42
  return None
43
 
44
+ # Function to load grammar evaluation pipelines on demand
45
  def load_grammar_pipeline(model_name):
46
  try:
47
  return pipeline("text2text-generation", model=model_name)
 
49
  print(f"Error loading grammar model {model_name}: {e}")
50
  return None
51
 
52
+ # Pre-load grammar evaluators
53
  rater_models = []
54
  for model_name in grammar_model_names:
55
  p = load_grammar_pipeline(model_name)
56
  if p is not None:
57
  rater_models.append(p)
58
 
59
+ # Utility functions to clean text and check for palindromes
60
  def clean_text(text):
61
  return re.sub(r'[^a-zA-Z0-9]', '', text.lower())
62
 
 
64
  cleaned = clean_text(text)
65
  return cleaned == cleaned[::-1]
66
 
67
+ # Build prompt with clear instructions to output only the palindrome.
68
  def build_prompt(lang):
69
  return (
70
  f"Instruction: Generate a single original palindrome in {lang}.\n"
71
  "Output only the palindrome. The palindrome should be a continuous text that reads the same forward and backward.\n"
72
+ "Do not output any additional text or commentary.\n"
73
  "Palindrome: "
74
  )
75
 
76
+ # Build prompt for grammar evaluation
77
  def grammar_prompt(pal, lang):
78
  return (
79
  f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
 
81
  f'"{pal}"\n'
82
  )
83
 
84
+ # Extract numeric score from text output
85
  def extract_score(text):
86
  match = re.search(r"\d{1,3}", text)
87
  if match:
 
89
  return min(max(score, 0), 100)
90
  return 0
91
 
92
+ # Main benchmark function - runs all tests and saves CSV automatically.
93
  def run_benchmark_all():
94
  results = []
 
95
  for model_name in premium_models:
96
  gen_pipeline = load_generation_pipeline(model_name)
97
  if gen_pipeline is None:
98
  continue
 
99
  for code, lang in languages.items():
100
  prompt = build_prompt(lang)
101
  try:
102
  gen_output = gen_pipeline(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
103
  except Exception as e:
104
  gen_output = f"Error generating text: {e}"
 
105
  valid = is_palindrome(gen_output)
106
  cleaned_len = len(clean_text(gen_output))
107
 
108
+ # Evaluate grammar using both grammar models
109
  scores = []
110
  for rater in rater_models:
111
  rprompt = grammar_prompt(gen_output, lang)
 
129
  "Final Score": final_score
130
  })
131
 
 
132
  df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
 
 
133
  csv_path = "benchmark_results.csv"
134
  df.to_csv(csv_path, index=False)
135
+ print(f"CSV saved to {os.path.abspath(csv_path)}")
 
 
136
  return gr.Dataframe(df), csv_path
137
 
138
+ # Build the Gradio UI using a Blocks layout
139
  with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
140
  gr.Markdown("# Premium Model Palindrome Benchmark")
141
+ gr.Markdown(
142
+ "This benchmark runs automatically over 6 premium text-generation models across 5 languages "
143
+ "(English, German, Spanish, French, Portuguese) and saves the results to a CSV file upon completion."
144
+ )
145
  with gr.Row():
146
  run_button = gr.Button("Run All Benchmarks")
 
147
  output_table = gr.Dataframe(label="Benchmark Results")
148
  output_file = gr.File(label="Download CSV Results")
 
149
  run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])
150
 
151
  demo.launch()