PabloTJ committed (verified)
Commit 4963b4f · Parent(s): 3fb2bff

Update app.py

Files changed (1): app.py (+37 -33)
app.py CHANGED
@@ -7,16 +7,13 @@ import pandas as pd
 # Set a seed for reproducibility
 set_seed(42)
 
-# Define five small models for generation (free, lightweight)
-small_models = [
-    "distilgpt2",               # ~82M parameters
-    "gpt2",                     # ~124M parameters
-    "EleutherAI/gpt-neo-125M",  # ~125M parameters
-    "sshleifer/tiny-gpt2",      # extremely small variant
-    "microsoft/DialoGPT-small"  # DialoGPT small
+# Define two premium generation models for better quality outputs.
+premium_models = [
+    "mistralai/Mistral-7B-v0.1",
+    "HuggingFaceH4/zephyr-7b-beta"
 ]
 
-# Define five languages: English, German, Spanish, French, Portuguese
+# Define five languages: English, German, Spanish, French, Portuguese.
 languages = {
     "en": "English",
     "de": "German",
@@ -25,16 +22,16 @@ languages = {
     "pt": "Portuguese"
 }
 
-# Define two cost-effective grammar evaluation models (unchanged)
+# Define two cost-effective grammar evaluation models.
 grammar_model_names = [
     "vennify/t5-base-grammar-correction",
     "hassaanik/grammar-correction-model"
 ]
 
-# Functions to load pipelines on demand
+# Functions to load pipelines on demand.
 def load_generation_pipeline(model_name):
     try:
-        # Using text-generation pipeline for causal LM models
+        # The text-generation pipeline loads a causal LM.
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
@@ -42,19 +39,20 @@ def load_generation_pipeline(model_name):
 
 def load_grammar_pipeline(model_name):
     try:
+        # Using text2text-generation for grammar correction.
         return pipeline("text2text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading grammar model {model_name}: {e}")
         return None
 
-# Pre-load grammar evaluator pipelines
+# Pre-load grammar evaluator pipelines.
 rater_models = []
 for model_name in grammar_model_names:
     p = load_grammar_pipeline(model_name)
     if p is not None:
         rater_models.append(p)
 
-# Utility functions for checking palindromes and cleaning text
+# Utility functions.
 def clean_text(text):
     return re.sub(r'[^a-zA-Z0-9]', '', text.lower())
 
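For reference, the palindrome check normalizes case and strips everything that is not an ASCII letter or digit before comparing the string to its reverse. A quick standalone illustration mirroring the two utilities above:

```python
import re

def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9]', '', text.lower())

def is_palindrome(text):
    cleaned = clean_text(text)
    return cleaned == cleaned[::-1]

print(is_palindrome("A man, a plan, a canal: Panama!"))  # True
print(is_palindrome("Palindrome: hello"))                # False
print(is_palindrome(""))                                 # True: empty string passes
```

One caveat: the ASCII-only character class deletes accented letters entirely, so German, Spanish, French, and Portuguese outputs are judged only on their unaccented characters.
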
@@ -62,17 +60,21 @@ def is_palindrome(text):
     cleaned = clean_text(text)
     return cleaned == cleaned[::-1]
 
-# Updated prompt that instructs the model to output ONLY the palindrome.
+# Updated prompt which explicitly instructs the model to output only a palindrome.
 def build_prompt(lang):
     return (
-        f"Instruction: Write the longest original palindrome you can in {lang}. "
-        "The output should contain nothing else but the palindrome. "
-        "Do not include any additional commentary or repeated instructions. "
+        f"Instruction: Generate a single original palindrome in {lang}.\n"
+        "Output only the palindrome. The palindrome should be a continuous text that reads the same forward and backward.\n"
+        "Do not output any additional text, commentary, or the prompt itself.\n"
         "Palindrome: "
     )
 
 def grammar_prompt(pal, lang):
-    return f'''Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. Only return a number with no explanation:\n\n"{pal}"\n'''
+    return (
+        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
+        "Return only a number with no explanation.\n\n"
+        f'"{pal}"\n'
+    )
 
 def extract_score(text):
     match = re.search(r"\d{1,3}", text)
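
`extract_score` (split across this hunk and the next) grabs the first one-to-three-digit run in a rater's reply and clamps it to [0, 100]. Assuming the elided middle of the function converts the regex match with `int(match.group())`, its behavior looks like this (illustrative only):

```python
import re

def extract_score(text):
    match = re.search(r"\d{1,3}", text)
    if match:  # assumed from the surrounding hunks
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0

print(extract_score("85"))                # 85
print(extract_score("I'd rate it 9/10"))  # 9  -- the first digit run wins
print(extract_score("999"))               # 100 -- clamped to the upper bound
print(extract_score("no digits"))         # 0
```
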
@@ -81,28 +83,29 @@ def extract_score(text):
         return min(max(score, 0), 100)
     return 0
 
-# Main benchmark function that runs all tests at once
+# Main benchmark function running all tests at once.
 def run_benchmark_all():
     results = []
-    # Iterate over each small model
-    for model_name in small_models:
+
+    # Iterate over each premium model.
+    for model_name in premium_models:
         gen_pipeline = load_generation_pipeline(model_name)
         if gen_pipeline is None:
-            continue  # Skip this model if it fails to load
-
-        # Iterate over each language
+            continue  # Skip if model loading failed.
+
+        # Iterate over the five languages.
         for code, lang in languages.items():
             prompt = build_prompt(lang)
             try:
-                # Generate text with a moderate max token limit
-                gen_output = gen_pipeline(prompt, max_new_tokens=50, do_sample=True)[0]['generated_text'].strip()
+                # Generate output with a moderate token limit; adjust max_new_tokens if needed.
+                gen_output = gen_pipeline(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
             except Exception as e:
                 gen_output = f"Error generating text: {e}"
-            # Check if the generated output is a palindrome
+
             valid = is_palindrome(gen_output)
             cleaned_len = len(clean_text(gen_output))
 
-            # Evaluate grammar using both grammar models
+            # Evaluate grammar using both grammar models.
             scores = []
             for rater in rater_models:
                 rprompt = grammar_prompt(gen_output, lang)
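
One behavior worth flagging in the generation call above: by default the text-generation pipeline returns the prompt concatenated with the continuation in `generated_text`, so `is_palindrome` ends up checking the prompt text too. A sketch of how the continuation alone could be isolated, using the pipeline's standard `return_full_text` flag (a possible follow-up, not something this commit does):

```python
# Sketch: request only the newly generated tokens, not prompt + continuation.
out = gen_pipeline(prompt, max_new_tokens=100, do_sample=True,
                   return_full_text=False)
gen_output = out[0]['generated_text'].strip()
```
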
@@ -113,7 +116,7 @@ def run_benchmark_all():
             except Exception as e:
                 scores.append(0)
             avg_score = np.mean(scores) if scores else 0
-            # Penalize if the generated text is not a valid palindrome
+            # Apply penalty if the output is not a valid palindrome.
             penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
             final_score = round(cleaned_len * penalty, 2)
 
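The scoring rule in this hunk is length times grammar quality, with the grammar factor halved when the output fails the palindrome check. A worked example with made-up numbers:

```python
cleaned_len = 24   # alphanumeric characters left after clean_text
avg_score = 80     # mean rating from the two grammar models
valid = False      # output was not a true palindrome

penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
final_score = round(cleaned_len * penalty, 2)
print(final_score)  # 9.6 -- would be 19.2 had the palindrome check passed
```
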
@@ -130,10 +133,10 @@ def run_benchmark_all():
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
     return gr.Dataframe(df)
 
-# Build the Gradio UI using Blocks (canvas layout)
-with gr.Blocks(title="Small Model Palindrome Benchmark") as demo:
-    gr.Markdown("# Small Model Palindrome Benchmark")
-    gr.Markdown("This benchmark automatically runs over 5 small text-generation models and 5 languages (English, German, Spanish, French, Portuguese).")
+# Gradio UI built with Blocks for a canvas-style layout.
+with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
+    gr.Markdown("# Premium Model Palindrome Benchmark")
+    gr.Markdown("This benchmark runs automatically over two premium text-generation models across 5 languages (English, German, Spanish, French, Portuguese).")
 
     with gr.Row():
         run_button = gr.Button("Run All Benchmarks")
@@ -142,3 +145,4 @@ with gr.Blocks(title="Small Model Palindrome Benchmark") as demo:
     run_button.click(fn=run_benchmark_all, inputs=[], outputs=output_table)
 
 demo.launch()
+