# Hugging Face Space: premium-model palindrome benchmark app.
# (Web-scrape residue removed here: Spaces page header, file size, commit hashes, line gutter.)
import gradio as gr
from transformers import pipeline, set_seed
import re
import numpy as np
import pandas as pd
import os
import torch
# Report the compute device once at startup (debug aid) and pin the RNG seed
# so generation sampling is reproducible across runs.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected. Running on CPU.")

set_seed(42)
# The six premium 7B-class generation models under benchmark.
# Order matters only for result presentation, not scoring.
premium_models = [
    "Qwen/Qwen2.5-Omni-7B",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "deepseek-ai/Janus-Pro-7B",
    "meta-llama/Llama-2-7b-hf",
    "Alibaba-NLP/gte-Qwen2-7B-instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]
# ISO 639-1 code -> display name for the five benchmark languages.
languages = dict(
    en="English",
    de="German",
    es="Spanish",
    fr="French",
    pt="Portuguese",
)
# Two lightweight grammar-correction models used as graders.
grammar_model_names = [
    "vennify/t5-base-grammar-correction",
    "hassaanik/grammar-correction-model",
]
# transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
def load_generation_pipeline(model_name):
    """Try to build a text-generation pipeline for *model_name*.

    Returns the pipeline on success, or None when loading fails for any
    reason (missing weights, out of memory, gated repo, ...). Failures
    are logged to stdout rather than raised so the benchmark can skip
    unavailable models and keep going.
    """
    try:
        gen = pipeline("text-generation", model=model_name, device=device)
    except Exception as e:
        print(f"Error loading generation model {model_name}: {e}")
        return None
    return gen
def load_grammar_pipeline(model_name):
    """Try to build a text2text-generation pipeline for *model_name*.

    Mirrors load_generation_pipeline: returns the pipeline, or None when
    loading fails, logging the error to stdout instead of raising.
    """
    try:
        rater = pipeline("text2text-generation", model=model_name, device=device)
    except Exception as e:
        print(f"Error loading grammar model {model_name}: {e}")
        return None
    return rater
# Pre-load the grammar evaluators once; models that fail to load are
# simply skipped so scoring degrades gracefully.
rater_models = [
    evaluator
    for evaluator in (load_grammar_pipeline(name) for name in grammar_model_names)
    if evaluator is not None
]
def clean_text(text):
    """Lowercase *text* and drop every non-alphanumeric character.

    Uses str.isalnum() instead of the previous ASCII-only regex
    ``[^a-zA-Z0-9]``: that regex silently deleted accented letters
    (é, ü, ñ, ç, ...), which corrupted palindrome validation for the
    German/Spanish/French/Portuguese benchmark languages. For pure-ASCII
    input the result is unchanged.
    """
    return "".join(ch for ch in text.lower() if ch.isalnum())
def is_palindrome(text):
    """Return True if *text*, after normalization, reads the same both ways."""
    normalized = clean_text(text)
    return normalized == "".join(reversed(normalized))
def build_prompt(lang):
    """Return the generation prompt asking for one palindrome in *lang*.

    The instructions tell the model to emit only the palindrome itself,
    with no commentary, so the output can be validated directly.
    """
    parts = [
        f"Instruction: Generate a single original palindrome in {lang}.",
        "Output only the palindrome. The palindrome should be a continuous text that reads the same forward and backward.",
        "Do not output any additional text or commentary.",
        "Palindrome: ",
    ]
    return "\n".join(parts)
def grammar_prompt(pal, lang):
    """Return the grading prompt asking for a 0-100 grammar score of *pal* in *lang*."""
    header = (
        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
        "Return only a number with no explanation."
    )
    return header + "\n\n" + f'"{pal}"\n'
def extract_score(text):
    """Pull the first run of 1-3 digits from *text*, clamped to [0, 100].

    Returns 0 when *text* contains no digits at all (e.g. the rater
    produced prose instead of a number).
    """
    found = re.search(r"\d{1,3}", text)
    if not found:
        return 0
    return max(0, min(int(found.group()), 100))
def run_benchmark_all():
    """Benchmark every premium model in every language and save results to CSV.

    For each (model, language) pair: generate a palindrome, validate it,
    have each grammar rater score it, and compute a final score of
    cleaned-length x grammar-fraction (halved when the text is not
    actually a palindrome). Models that fail to load are skipped.

    Returns:
        tuple: (gr.Dataframe component with sorted results, CSV file path).
    """
    result_columns = ["Model", "Language", "Palindrome", "Valid",
                      "Length", "Grammar Score", "Final Score"]
    results = []
    for model_name in premium_models:
        gen_pipeline = load_generation_pipeline(model_name)
        if gen_pipeline is None:
            continue
        for lang in languages.values():
            prompt = build_prompt(lang)
            try:
                raw = gen_pipeline(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text']
                # text-generation pipelines echo the prompt before the
                # completion; strip it so validity and length are judged
                # on the model's own text, not on the instructions.
                gen_output = raw[len(prompt):].strip() if raw.startswith(prompt) else raw.strip()
            except Exception as e:
                gen_output = f"Error generating text: {e}"
            valid = is_palindrome(gen_output)
            cleaned_len = len(clean_text(gen_output))
            # The rating prompt does not depend on the rater, so build it once.
            rprompt = grammar_prompt(gen_output, lang)
            scores = []
            for rater in rater_models:
                try:
                    rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
                    scores.append(extract_score(rtext))
                except Exception:
                    # A failing rater counts as a zero instead of aborting the run.
                    scores.append(0)
            avg_score = np.mean(scores) if scores else 0
            # Invalid palindromes keep only half of their grammar-weighted length.
            penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
            final_score = round(cleaned_len * penalty, 2)
            results.append({
                "Model": model_name,
                "Language": lang,
                "Palindrome": gen_output,
                "Valid": "✅" if valid else "❌",
                "Length": cleaned_len,
                "Grammar Score": avg_score,
                "Final Score": final_score,
            })
    if results:
        df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
    else:
        # No model loaded at all: return an empty table with the expected
        # columns instead of crashing sort_values with a KeyError.
        df = pd.DataFrame(columns=result_columns)
    csv_path = "benchmark_results.csv"
    df.to_csv(csv_path, index=False)
    print(f"CSV saved to {os.path.abspath(csv_path)}")
    return gr.Dataframe(df), csv_path
# --- Gradio UI: one-button canvas that runs the benchmark and serves the CSV. ---
# NOTE(review): the original indentation was lost, so the exact nesting of the
# table/file widgets relative to the Row is an assumption — confirm the layout.
with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
    gr.Markdown("# Premium Model Palindrome Benchmark")
    gr.Markdown(
        "This benchmark runs automatically over 6 premium text-generation models across 5 languages and saves the results "
        "to a CSV file upon completion."
    )
    with gr.Row():
        run_button = gr.Button("Run All Benchmarks")
    output_table = gr.Dataframe(label="Benchmark Results")
    output_file = gr.File(label="Download CSV Results")
    run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])

demo.launch()