PabloTJ committed on
Commit
12a6276
·
verified ·
1 Parent(s): 2cbbb89

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +2 -13
  2. app.py +74 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,14 +1,3 @@
1
- ---
2
- title: Palindroms
3
- emoji: 🏃
4
- colorFrom: pink
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.24.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: LLM benchmark for palidrom generation
12
- ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # 🔁 LLM Palindrome Benchmark
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ This app generates long, original palindromes in 10 languages using Hugging Face LLMs, rates their grammar using other models, and scores them based on correctness and fluency.
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from transformers import pipeline
4
+ import re
5
+ from langdetect import detect
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
# Load models for generation and rating.
# The generator is a causal (decoder-only) LM, so it uses "text-generation".
gen_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")

# Each rater must be loaded with the task that matches its architecture:
# zephyr is a causal LM, while flan-t5 is an encoder-decoder (seq2seq) model
# and cannot be loaded by the causal-only "text-generation" pipeline.
# NOTE(review): "text2text-generation" is the seq2seq task name in
# transformers 4.x; confirm it is still available in the installed version.
rater_models = [
    pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta"),
    pipeline("text2text-generation", model="google/flan-t5-large"),
]
15
+
16
# Benchmark languages: language code -> English name used inside the prompts.
languages = dict(
    [
        ("en", "English"),
        ("es", "Spanish"),
        ("fr", "French"),
        ("de", "German"),
        ("it", "Italian"),
        ("pt", "Portuguese"),
        ("ru", "Russian"),
        ("ar", "Arabic"),
        ("hi", "Hindi"),
        ("ja", "Japanese"),
    ]
)
21
+
22
def clean_text(text):
    """Normalize *text* for palindrome comparison.

    The text is NFC-normalized, case-folded, and stripped of everything that
    is not a Unicode letter or digit (spaces, punctuation, underscores, ...).

    Unlike an ASCII-only filter, this keeps Cyrillic, Arabic, Devanagari,
    Japanese and accented Latin characters, so non-English palindromes are
    not reduced to an empty string.

    Args:
        text: Raw text, e.g. a model generation.

    Returns:
        The lowercase alphanumeric-only form of ``text`` (possibly empty).
    """
    import unicodedata  # local import keeps this block self-contained

    # NFC first so that "e" + combining accent and precomposed "é" are
    # treated identically; combining marks that do not compose are dropped
    # by the isalnum() filter below.
    normalized = unicodedata.normalize("NFC", text).casefold()
    return "".join(ch for ch in normalized if ch.isalnum())
24
+
25
def is_palindrome(text):
    """Return True if *text* reads the same in both directions.

    The comparison is done on the output of ``clean_text``, so case,
    whitespace and punctuation are ignored.

    Text that contains no comparable characters at all (empty string,
    punctuation only) is reported as *not* a palindrome: for the benchmark an
    empty generation must not be rewarded as a "valid" result.
    """
    cleaned = clean_text(text)
    # Guard against the vacuous case: "" == ""[::-1] would otherwise be True.
    return bool(cleaned) and cleaned == cleaned[::-1]
28
+
29
def grammar_prompt(pal, lang):
    """Build the instruction sent to a rater model.

    Args:
        pal: The candidate palindrome to be rated.
        lang: Human-readable language name inserted into the instruction.

    Returns:
        A prompt asking for a bare 0-100 grammar score for ``pal``.
    """
    instruction = f'''Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. Only return a number with no explanation:\n\n"{pal}"\n'''
    return instruction
31
+
32
def extract_score(text):
    """Pull the first 1-3 digit number out of *text* as a 0-100 score.

    Returns 0 when the text contains no digits; values above 100 are
    clamped to 100.
    """
    hit = re.search(r"\d{1,3}", text)
    if hit is None:
        return 0
    value = int(hit.group())
    # Clamp into the inclusive range [0, 100].
    return max(0, min(value, 100))
38
+
39
def _strip_prompt(generated, prompt):
    """Return *generated* without a leading echo of *prompt*, whitespace-trimmed."""
    # Text-generation pipelines return prompt + continuation by default;
    # seq2seq pipelines return only the continuation, in which case this is a no-op.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()


def _detect_language(text):
    """Return langdetect's language code for *text*, or "unknown" if detection fails."""
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # Raised for empty text or text without any usable features.
        return "unknown"


def run_benchmark():
    """Generate one palindrome per language, rate it, and rank the results.

    For every entry in ``languages`` the generator model is asked for an
    original palindrome. The answer is checked with ``is_palindrome``, its
    cleaned length is measured, its language is detected, and every model in
    ``rater_models`` is asked for a 0-100 grammar score.

    The final score is ``cleaned_length * (avg_grammar / 100)``, halved when
    the text is not a valid palindrome.

    Returns:
        A ``gr.Dataframe`` with one row per language, sorted by
        "Final Score" in descending order.
    """
    results = []
    for lang in languages.values():
        prompt = f'''Write the longest original palindrome you can in {lang}. It should be creative and not a known palindrome. If it is not a correct palindrome, you will lose points according to how correct it is.'''

        raw_output = gen_model(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text']
        # Score only the model's answer, not the echoed instruction.
        gen_output = _strip_prompt(raw_output, prompt)
        valid = is_palindrome(gen_output)
        cleaned_len = len(clean_text(gen_output))
        detected_lang = _detect_language(gen_output)

        # The rating prompt is the same for every rater, so build it once.
        rprompt = grammar_prompt(gen_output, lang)
        scores = []
        for rater in rater_models:
            rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
            # Without stripping, the "0" in "Rate from 0 to 100" would be
            # picked up as the score for every rater that echoes its prompt.
            scores.append(extract_score(_strip_prompt(rtext, rprompt)))

        # np.mean([]) yields NaN with a warning; treat "no raters" as 0.
        avg_score = float(np.mean(scores)) if scores else 0.0
        penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
        final_score = round(cleaned_len * penalty, 2)

        results.append({
            "Language": lang,
            "Palindrome": gen_output,
            "Valid": "✅" if valid else "❌",
            "Length": cleaned_len,
            "Grammar Score": avg_score,
            "Final Score": final_score,
            "Detected Lang": detected_lang,
        })

    df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
    return gr.Dataframe(df)
72
+
73
# Build the Gradio UI: no inputs, one dataframe output filled by run_benchmark,
# then start serving it.
iface = gr.Interface(
    fn=run_benchmark,
    inputs=[],
    outputs="dataframe",
    title="🔁 LLM Palindrome Benchmark",
)
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers
2
+ gradio
3
+ langdetect
4
+ pandas
5
+ accelerate