Spaces:

PabloTJ
/

palindroms

Running

App Files Files Community

PabloTJ committed on Apr 10

Commit

ec2f5cd

verified ·

1 Parent(s): 4963b4f

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -16

app.py CHANGED Viewed

@@ -31,7 +31,6 @@ grammar_model_names = [
 # Functions to load pipelines on demand.
 def load_generation_pipeline(model_name):
     try:
-        # The text-generation pipeline loads a causal LM.
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
@@ -39,7 +38,6 @@ def load_generation_pipeline(model_name):
 def load_grammar_pipeline(model_name):
     try:
-        # Using text2text-generation for grammar correction.
         return pipeline("text2text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading grammar model {model_name}: {e}")
@@ -52,7 +50,6 @@ for model_name in grammar_model_names:
     if p is not None:
         rater_models.append(p)
-# Utility functions.
 def clean_text(text):
     return re.sub(r'[^a-zA-Z0-9]', '', text.lower())
@@ -60,7 +57,7 @@ def is_palindrome(text):
     cleaned = clean_text(text)
     return cleaned == cleaned[::-1]
-# Updated prompt which explicitly instructs the model to output only a palindrome.
 def build_prompt(lang):
     return (
         f"Instruction: Generate a single original palindrome in {lang}.\n"
@@ -83,21 +80,18 @@ def extract_score(text):
         return min(max(score, 0), 100)
     return 0
-# Main benchmark function running all tests at once.
 def run_benchmark_all():
     results = []
-    # Iterate over each premium model.
     for model_name in premium_models:
         gen_pipeline = load_generation_pipeline(model_name)
         if gen_pipeline is None:
-            continue  # Skip if model loading failed.
-        # Iterate over the five languages.
         for code, lang in languages.items():
             prompt = build_prompt(lang)
             try:
-                # Generate output with a moderate token limit; adjust max_new_tokens if needed.
                 gen_output = gen_pipeline(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
             except Exception as e:
                 gen_output = f"Error generating text: {e}"
@@ -105,7 +99,6 @@ def run_benchmark_all():
             valid = is_palindrome(gen_output)
             cleaned_len = len(clean_text(gen_output))
-            # Evaluate grammar using both grammar models.
             scores = []
             for rater in rater_models:
                 rprompt = grammar_prompt(gen_output, lang)
@@ -116,7 +109,6 @@ def run_benchmark_all():
                 except Exception as e:
                     scores.append(0)
             avg_score = np.mean(scores) if scores else 0
-            # Apply penalty if the output is not a valid palindrome.
             penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
             final_score = round(cleaned_len * penalty, 2)
@@ -130,19 +122,27 @@ def run_benchmark_all():
                 "Final Score": final_score
             })
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
-    return gr.Dataframe(df)
-# Gradio UI built with Blocks for a canvas-style layout.
 with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
     gr.Markdown("# Premium Model Palindrome Benchmark")
-    gr.Markdown("This benchmark runs automatically over two premium text-generation models across 5 languages (English, German, Spanish, French, Portuguese).")
     with gr.Row():
         run_button = gr.Button("Run All Benchmarks")
     output_table = gr.Dataframe(label="Benchmark Results")
-    run_button.click(fn=run_benchmark_all, inputs=[], outputs=output_table)
 demo.launch()

 # Functions to load pipelines on demand.
 def load_generation_pipeline(model_name):
     try:
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
 def load_grammar_pipeline(model_name):
     try:
         return pipeline("text2text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading grammar model {model_name}: {e}")
     if p is not None:
         rater_models.append(p)
 def clean_text(text):
     return re.sub(r'[^a-zA-Z0-9]', '', text.lower())
     cleaned = clean_text(text)
     return cleaned == cleaned[::-1]
+# Updated prompt that instructs the model to output ONLY the palindrome.
 def build_prompt(lang):
     return (
         f"Instruction: Generate a single original palindrome in {lang}.\n"
         return min(max(score, 0), 100)
     return 0
+# Main benchmark function that runs all tests at once and saves results to a CSV file.
 def run_benchmark_all():
     results = []
     for model_name in premium_models:
         gen_pipeline = load_generation_pipeline(model_name)
         if gen_pipeline is None:
+            continue
         for code, lang in languages.items():
             prompt = build_prompt(lang)
             try:
                 gen_output = gen_pipeline(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
             except Exception as e:
                 gen_output = f"Error generating text: {e}"
             valid = is_palindrome(gen_output)
             cleaned_len = len(clean_text(gen_output))
             scores = []
             for rater in rater_models:
                 rprompt = grammar_prompt(gen_output, lang)
                 except Exception as e:
                     scores.append(0)
             avg_score = np.mean(scores) if scores else 0
             penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
             final_score = round(cleaned_len * penalty, 2)
                 "Final Score": final_score
             })
+    # Create DataFrame and sort by Final Score.
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
+    # Save results to CSV file.
+    csv_path = "benchmark_results.csv"
+    df.to_csv(csv_path, index=False)
+    return gr.Dataframe(df), csv_path
+# Gradio UI using Blocks for a canvas layout.
 with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
     gr.Markdown("# Premium Model Palindrome Benchmark")
+    gr.Markdown("This benchmark runs automatically over 2 premium text-generation models across 5 languages (English, German, Spanish, French, Portuguese), and saves the results to a CSV file for later review.")
     with gr.Row():
         run_button = gr.Button("Run All Benchmarks")
+    # The interface now outputs both a DataFrame and a File Download.
     output_table = gr.Dataframe(label="Benchmark Results")
+    output_file = gr.File(label="Download CSV Results")
+    run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])
 demo.launch()