Spaces:

PabloTJ
/

palindroms

Running

App Files Files Community

PabloTJ committed 23 days ago

Commit

3fb2bff

verified ·

1 Parent(s): 4136261

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -17

app.py CHANGED Viewed

@@ -13,10 +13,10 @@ small_models = [
     "gpt2",                          # ~124M parameters
     "EleutherAI/gpt-neo-125M",         # ~125M parameters
     "sshleifer/tiny-gpt2",           # extremely small variant
-    "microsoft/DialoGPT-small"       # dialoGPT in small size
 ]
-# Define five languages (English, German, Spanish, French, Portuguese)
 languages = {
     "en": "English",
     "de": "German",
@@ -25,7 +25,7 @@ languages = {
     "pt": "Portuguese"
 }
-# Define two cost-effective grammar evaluation models
 grammar_model_names = [
     "vennify/t5-base-grammar-correction",
     "hassaanik/grammar-correction-model"
@@ -34,7 +34,7 @@ grammar_model_names = [
 # Functions to load pipelines on demand
 def load_generation_pipeline(model_name):
     try:
-        # Use text-generation pipeline for causal LM models
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
@@ -62,6 +62,15 @@ def is_palindrome(text):
     cleaned = clean_text(text)
     return cleaned == cleaned[::-1]
 def grammar_prompt(pal, lang):
     return f'''Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. Only return a number with no explanation:\n\n"{pal}"\n'''
@@ -75,28 +84,25 @@ def extract_score(text):
 # Main benchmark function that runs all tests at once
 def run_benchmark_all():
     results = []
     for model_name in small_models:
-        # Load the generation pipeline for the current small model
         gen_pipeline = load_generation_pipeline(model_name)
         if gen_pipeline is None:
-            continue  # Skip if model fails to load
         for code, lang in languages.items():
-            # Prompt for generating a palindrome in the given language
-            prompt = (
-                f"Write the longest original palindrome you can in {lang}. "
-                "It should be creative and not a known palindrome. "
-                "If it is not a correct palindrome, you will lose points according to how correct it is."
-            )
             try:
                 gen_output = gen_pipeline(prompt, max_new_tokens=50, do_sample=True)[0]['generated_text'].strip()
             except Exception as e:
                 gen_output = f"Error generating text: {e}"
             valid = is_palindrome(gen_output)
             cleaned_len = len(clean_text(gen_output))
-            # Measure grammar evaluation using both rater models
             scores = []
             for rater in rater_models:
                 rprompt = grammar_prompt(gen_output, lang)
@@ -107,7 +113,7 @@ def run_benchmark_all():
                 except Exception as e:
                     scores.append(0)
             avg_score = np.mean(scores) if scores else 0
-            # Apply a penalty if the text is not a valid palindrome
             penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
             final_score = round(cleaned_len * penalty, 2)
@@ -124,10 +130,10 @@ def run_benchmark_all():
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
     return gr.Dataframe(df)
-# Build Gradio UI using Blocks (canvas layout)
 with gr.Blocks(title="Small Model Palindrome Benchmark") as demo:
     gr.Markdown("# Small Model Palindrome Benchmark")
-    gr.Markdown("This benchmark runs automatically during the night over 5 small text-generation models and 5 languages (English, German, Spanish, French, Portuguese). All tests are run at once.")
     with gr.Row():
         run_button = gr.Button("Run All Benchmarks")

     "gpt2",                          # ~124M parameters
     "EleutherAI/gpt-neo-125M",         # ~125M parameters
     "sshleifer/tiny-gpt2",           # extremely small variant
+    "microsoft/DialoGPT-small"       # DialoGPT small
 ]
+# Define five languages: English, German, Spanish, French, Portuguese
 languages = {
     "en": "English",
     "de": "German",
     "pt": "Portuguese"
 }
+# Define two cost-effective grammar evaluation models (unchanged)
 grammar_model_names = [
     "vennify/t5-base-grammar-correction",
     "hassaanik/grammar-correction-model"
 # Functions to load pipelines on demand
 def load_generation_pipeline(model_name):
     try:
+        # Using text-generation pipeline for causal LM models
         return pipeline("text-generation", model=model_name)
     except Exception as e:
         print(f"Error loading generation model {model_name}: {e}")
     cleaned = clean_text(text)
     return cleaned == cleaned[::-1]
+# Updated prompt that instructs the model to output ONLY the palindrome.
+def build_prompt(lang):
+    return (
+        f"Instruction: Write the longest original palindrome you can in {lang}. "
+        "The output should contain nothing else but the palindrome. "
+        "Do not include any additional commentary or repeated instructions. "
+        "Palindrome: "
+    )
 def grammar_prompt(pal, lang):
     return f'''Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. Only return a number with no explanation:\n\n"{pal}"\n'''
 # Main benchmark function that runs all tests at once
 def run_benchmark_all():
     results = []
+    # Iterate over each small model
     for model_name in small_models:
         gen_pipeline = load_generation_pipeline(model_name)
         if gen_pipeline is None:
+            continue  # Skip this model if it fails to load
+        # Iterate over each language
         for code, lang in languages.items():
+            prompt = build_prompt(lang)
             try:
+                # Generate text with a moderate max token limit
                 gen_output = gen_pipeline(prompt, max_new_tokens=50, do_sample=True)[0]['generated_text'].strip()
             except Exception as e:
                 gen_output = f"Error generating text: {e}"
+            # Check if the generated output is a palindrome
             valid = is_palindrome(gen_output)
             cleaned_len = len(clean_text(gen_output))
+            # Evaluate grammar using both grammar models
             scores = []
             for rater in rater_models:
                 rprompt = grammar_prompt(gen_output, lang)
                 except Exception as e:
                     scores.append(0)
             avg_score = np.mean(scores) if scores else 0
+            # Penalize if the generated text is not a valid palindrome
             penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
             final_score = round(cleaned_len * penalty, 2)
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
     return gr.Dataframe(df)
+# Build the Gradio UI using Blocks (canvas layout)
 with gr.Blocks(title="Small Model Palindrome Benchmark") as demo:
     gr.Markdown("# Small Model Palindrome Benchmark")
+    gr.Markdown("This benchmark automatically runs over 5 small text-generation models and 5 languages (English, German, Spanish, French, Portuguese).")
     with gr.Row():
         run_button = gr.Button("Run All Benchmarks")