Spaces:
Running
Running
Upload 3 files
Browse files- README.md +2 -13
- app.py +74 -0
- requirements.txt +5 -0
README.md
CHANGED
@@ -1,14 +1,3 @@
|
|
1 |
-
|
2 |
-
title: Palindromes
|
3 |
-
emoji: 🏃
|
4 |
-
colorFrom: pink
|
5 |
-
colorTo: green
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 5.24.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
short_description: LLM benchmark for palindrome generation
|
12 |
-
---
|
13 |
|
14 |
-
|
|
|
1 |
+
# 🔁 LLM Palindrome Benchmark
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
+
This app generates long, original palindromes in 10 languages using Hugging Face LLMs, rates their grammar using other models, and scores them based on correctness and fluency.
|
app.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import gradio as gr
|
3 |
+
from transformers import pipeline
|
4 |
+
import re
|
5 |
+
from langdetect import detect
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
|
9 |
+
# Load models for generation and rating.
# NOTE: every pipeline below is instantiated at import time, so starting the
# app downloads and loads all three checkpoints before the UI is built.
gen_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")
rater_models = [
    pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta"),
    # FLAN-T5 is an encoder-decoder (seq2seq) checkpoint, not a causal LM, so
    # it has to be served through the text2text-generation task; the
    # text-generation task cannot load it.
    pipeline("text2text-generation", model="google/flan-t5-large"),
]
|
15 |
+
|
16 |
+
# Language list: two-letter language code -> English name of the language.
# Insertion order is the order in which the languages are iterated.
languages = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "ru": "Russian",
    "ar": "Arabic",
    "hi": "Hindi",
    "ja": "Japanese",
}
|
21 |
+
|
22 |
+
def clean_text(text):
    """Return *text* lower-cased with every non-alphanumeric character removed.

    Letters and digits of any script are kept (``str.isalnum``), so Cyrillic,
    Arabic, Devanagari or Japanese text is no longer reduced to an empty
    string the way an ASCII-only ``[^a-zA-Z0-9]`` filter would reduce it.
    Whitespace, punctuation and underscores are dropped. For pure-ASCII input
    the result is identical to the ASCII-only filter.

    NOTE(review): combining marks (e.g. Devanagari vowel signs) are not
    alphanumeric and are therefore dropped as well -- confirm this is the
    desired comparison granularity for those scripts.
    """
    return ''.join(ch for ch in text.lower() if ch.isalnum())
|
24 |
+
|
25 |
+
def is_palindrome(text):
    """Return True when *text* reads the same in both directions.

    The comparison is made on the output of ``clean_text(text)``, so only the
    characters that survive cleaning take part in it.

    Text that cleans down to an empty string is rejected: an empty string
    trivially equals its own reverse, which would otherwise mark blank or
    fully-stripped model output as a valid palindrome.
    """
    cleaned = clean_text(text)
    if not cleaned:
        return False
    return cleaned == cleaned[::-1]
|
28 |
+
|
29 |
+
def grammar_prompt(pal, lang):
    """Build the prompt that asks a rater model to grade a palindrome's grammar.

    Args:
        pal: The palindrome text; it is embedded in double quotes.
        lang: Name of the language the palindrome is supposed to be written in.

    Returns:
        The instruction sentence, a blank line, the quoted palindrome and a
        trailing newline.
    """
    instruction = (
        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
        "Only return a number with no explanation:"
    )
    quoted = f'"{pal}"'
    return f"{instruction}\n\n{quoted}\n"
|
31 |
+
|
32 |
+
def extract_score(text):
    """Pull the first number out of *text* and clamp it to the 0-100 range.

    Only the first run of one to three digits is considered. When *text*
    contains no digit at all, 0 is returned.
    """
    first_number = re.search(r"\d{1,3}", text)
    if first_number is None:
        return 0

    value = int(first_number.group())
    # Clamp into the rating scale: lower bound first, then upper bound.
    lower_bounded = max(value, 0)
    return min(lower_bounded, 100)
|
38 |
+
|
39 |
+
def _strip_echoed_prompt(generated, prompt):
    """Return *generated* without a leading copy of *prompt*, if there is one."""
    if generated.startswith(prompt):
        return generated[len(prompt):]
    return generated


def run_benchmark():
    """Generate one palindrome per language, rate it, and return a ranked table.

    For every entry of ``languages`` the generator model is asked for a
    palindrome. The answer is checked with ``is_palindrome``, measured with
    ``clean_text``, language-detected, and graded by each model in
    ``rater_models``. The final score is the cleaned length multiplied by the
    mean grammar score (as a 0-1 fraction); it is halved when the text is not
    a palindrome.

    Returns:
        A ``gr.Dataframe`` with one row per language, sorted by
        "Final Score" in descending order.
    """
    # langdetect is already a dependency of this file; the exception class is
    # imported here so the function brings its own name into scope.
    from langdetect.lang_detect_exception import LangDetectException

    results = []
    for lang in languages.values():
        prompt = f'''Write the longest original palindrome you can in {lang}. It should be creative and not a known palindrome. If it is not a correct palindrome, you will lose points according to how correct it is.'''

        raw_output = gen_model(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text']
        # text-generation pipelines return the prompt followed by the
        # completion by default; only the completion is the candidate
        # palindrome, so the echoed prompt must not be validated or measured.
        gen_output = _strip_echoed_prompt(raw_output, prompt).strip()
        valid = is_palindrome(gen_output)
        cleaned_len = len(clean_text(gen_output))
        try:
            detected_lang = detect(gen_output)
        except LangDetectException:
            # Raised when the text has no usable features (e.g. it is empty);
            # one bad generation should not abort the whole benchmark.
            detected_lang = "unknown"

        scores = []
        for rater in rater_models:
            rprompt = grammar_prompt(gen_output, lang)
            rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
            # The rating prompt itself contains "0 to 100"; if it is echoed
            # back, the first number found would always be that 0.
            scores.append(extract_score(_strip_echoed_prompt(rtext, rprompt)))

        avg_score = np.mean(scores)
        # Invalid palindromes keep only half of their grammar-weighted score.
        penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
        final_score = round(cleaned_len * penalty, 2)

        results.append({
            "Language": lang,
            "Palindrome": gen_output,
            "Valid": "✅" if valid else "❌",
            "Length": cleaned_len,
            "Grammar Score": avg_score,
            "Final Score": final_score,
            "Detected Lang": detected_lang,
        })

    df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
    return gr.Dataframe(df)
|
72 |
+
|
73 |
+
# Build the web UI: no input components, a single dataframe output that is
# filled by run_benchmark.
iface = gr.Interface(
    fn=run_benchmark,
    inputs=[],
    outputs="dataframe",
    title="🔁 LLM Palindrome Benchmark",
)
# Start serving the interface as soon as this module is executed.
iface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
gradio
|
3 |
+
langdetect
|
4 |
+
pandas
|
5 |
+
accelerate
|