File size: 5,474 Bytes
12a6276
9d5d030
12a6276
 
 
ad47898
e5a040d
 
0dfed7a
e5a040d
8d80fc8
 
 
 
 
0dfed7a
9d5d030
 
fa826ee
4963b4f
fa826ee
 
 
 
 
4963b4f
9d5d030
 
4963b4f
4136261
 
 
 
 
 
 
 
fa826ee
9d5d030
 
 
12a6276
 
0dfed7a
 
 
 
9d5d030
 
e5a040d
9d5d030
 
 
 
0dfed7a
9d5d030
 
e5a040d
9d5d030
 
 
 
e5a040d
9d5d030
 
 
 
 
 
12a6276
 
 
 
 
 
 
0dfed7a
3fb2bff
 
4963b4f
 
0dfed7a
3fb2bff
 
 
12a6276
4963b4f
 
 
 
 
12a6276
 
 
 
 
 
 
 
e5a040d
4136261
12a6276
4963b4f
4136261
 
ec2f5cd
4136261
3fb2bff
9d5d030
4963b4f
9d5d030
4136261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d5d030
12a6276
ec2f5cd
 
fa826ee
ec2f5cd
12a6276
0dfed7a
4963b4f
 
fa826ee
e5a040d
 
fa826ee
9d5d030
4136261
9d5d030
ec2f5cd
 
9d5d030
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import gradio as gr
from transformers import pipeline, set_seed
import re
import numpy as np
import pandas as pd
import os
import torch

# Debugging aid: report whether PyTorch can see a CUDA device, and which one.
print("CUDA available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    print("No GPU detected. Running on CPU.")
else:
    print("GPU Name:", torch.cuda.get_device_name(0))

# Fix the random seed so runs are reproducible.
set_seed(42)

# Generation models to benchmark (six Hugging Face model ids).
premium_models = [
    "Qwen/Qwen2.5-Omni-7B", "Qwen/Qwen2.5-VL-7B-Instruct",
    "deepseek-ai/Janus-Pro-7B", "meta-llama/Llama-2-7b-hf",
    "Alibaba-NLP/gte-Qwen2-7B-instruct", "HuggingFaceH4/zephyr-7b-beta",
]

# Benchmark languages, keyed by two-letter code:
# English, German, Spanish, French, Portuguese.
languages = dict(
    en="English",
    de="German",
    es="Spanish",
    fr="French",
    pt="Portuguese",
)

# Two cost-effective models used to evaluate grammar.
grammar_model_names = [
    "vennify/t5-base-grammar-correction", "hassaanik/grammar-correction-model",
]

# Pipeline device index: -1 selects the CPU, 0 selects the first GPU when CUDA is usable.
device = -1
if torch.cuda.is_available():
    device = 0

def load_generation_pipeline(model_name):
    """Create a ``text-generation`` pipeline for *model_name* on the module-level ``device``.

    Any exception raised while building the pipeline is printed and swallowed;
    ``None`` is returned in that case so the caller can skip the model.
    """
    generator = None
    try:
        generator = pipeline("text-generation", model=model_name, device=device)
    except Exception as err:
        print(f"Error loading generation model {model_name}: {err}")
    return generator

def load_grammar_pipeline(model_name):
    """Create a ``text2text-generation`` pipeline for *model_name* on the module-level ``device``.

    Any exception raised while building the pipeline is printed and swallowed;
    ``None`` is returned in that case so the caller can skip the model.
    """
    evaluator = None
    try:
        evaluator = pipeline("text2text-generation", model=model_name, device=device)
    except Exception as err:
        print(f"Error loading grammar model {model_name}: {err}")
    return evaluator

# Load every grammar evaluator up front, in list order; models that failed to
# load (loader returned None) are left out.
rater_models = [
    rater
    for rater in map(load_grammar_pipeline, grammar_model_names)
    if rater is not None
]

def clean_text(text):
    """Normalize *text* for palindrome comparison.

    Lower-cases the input and removes every character that is not a letter or
    a digit (whitespace, punctuation, underscores, symbols).

    The match is Unicode-aware, so accented letters are kept. An ASCII-only
    filter would drop them, making words such as "él" collapse to a single
    letter and pass as palindromes while under-counting their length.

    Args:
        text: The string to normalize.

    Returns:
        The lower-cased string containing only letters and digits.
    """
    # On a str pattern ``\W`` matches every non-word character in any script.
    # ``_`` counts as a word character, so it is listed explicitly to keep
    # removing it.
    return re.sub(r"[\W_]", "", text.lower())

def is_palindrome(text):
    """Return True if *text*, after ``clean_text`` normalization, equals its own reverse."""
    normalized = clean_text(text)
    backwards = "".join(reversed(normalized))
    return normalized == backwards

def build_prompt(lang):
    """Return the generation prompt asking for one palindrome in *lang*.

    The prompt tells the model to output only the palindrome and ends with a
    ``Palindrome: `` cue for it to complete.
    """
    prompt_lines = [
        f"Instruction: Generate a single original palindrome in {lang}.",
        "Output only the palindrome. The palindrome should be a continuous text that reads the same forward and backward.",
        "Do not output any additional text or commentary.",
        "Palindrome: ",
    ]
    return "\n".join(prompt_lines)

def grammar_prompt(pal, lang):
    """Return the rating prompt asking for a 0-100 grammar score of *pal* in *lang*.

    The palindrome is appended in double quotes on its own line after a blank line.
    """
    request = f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
    constraint = "Return only a number with no explanation.\n\n"
    quoted = f'"{pal}"\n'
    return request + constraint + quoted

def extract_score(text):
    """Return the first run of one to three digits in *text* as an int clamped to 0-100.

    Returns 0 when *text* contains no digit at all.
    """
    hit = re.search(r"\d{1,3}", text)
    if hit is None:
        return 0
    value = int(hit.group(0))
    return max(0, min(100, value))

# Main benchmark function that runs tests and saves CSV results.
def _generate_candidate(gen_pipeline, prompt):
    """Run one sampled generation and return ``(text, succeeded)``.

    On failure the text is the error message, kept so the problem is visible
    in the results table; ``succeeded`` is False so it is not scored.
    """
    try:
        outputs = gen_pipeline(
            prompt,
            max_new_tokens=100,
            do_sample=True,
            # By default the pipeline returns prompt + completion; only the
            # completion must be judged as the palindrome.
            return_full_text=False,
        )
        return outputs[0]["generated_text"].strip(), True
    except Exception as e:
        return f"Error generating text: {e}", False


def _average_grammar_score(text, lang):
    """Return the mean 0-100 rating of *text* over all loaded grammar raters (0 if none)."""
    # NOTE(review): the raters are text2text pipelines and extract_score takes
    # the first number in their output. If a rater echoes the prompt, that
    # number would be the "0" of "0 to 100" -- confirm what the raters return.
    rprompt = grammar_prompt(text, lang)  # identical for every rater
    scores = []
    for rater in rater_models:
        try:
            rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
            scores.append(extract_score(rtext))
        except Exception as e:
            # A failing rater still counts as 0, but the failure is reported.
            print(f"Error rating text with grammar model: {e}")
            scores.append(0)
    return np.mean(scores) if scores else 0


def run_benchmark_all():
    """Benchmark every premium model in every language and save the results as CSV.

    For each model that loads, one palindrome is generated per language. Each
    row records the model, language, generated text, whether the text is a
    non-empty palindrome, its cleaned length, the average grammar score and
    the final score. The final score is ``length * grammar / 100``, halved
    when the text is not a valid palindrome. Failed generations are listed
    with their error message and a score of 0.

    Returns:
        A ``(gr.Dataframe, csv_path)`` tuple: the results sorted by final
        score (descending) and the path of the written CSV file.
    """
    import gc  # local import: only needed to release models between runs

    columns = [
        "Model", "Language", "Palindrome", "Valid",
        "Length", "Grammar Score", "Final Score",
    ]
    results = []
    for model_name in premium_models:
        gen_pipeline = load_generation_pipeline(model_name)
        if gen_pipeline is None:
            continue
        try:
            for lang in languages.values():
                gen_output, generated = _generate_candidate(gen_pipeline, build_prompt(lang))
                if generated:
                    cleaned_len = len(clean_text(gen_output))
                    # An empty output is not a meaningful palindrome.
                    valid = cleaned_len > 0 and is_palindrome(gen_output)
                    avg_score = _average_grammar_score(gen_output, lang)
                else:
                    # Do not score an error message as if it were model output.
                    cleaned_len, valid, avg_score = 0, False, 0

                penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
                final_score = round(cleaned_len * penalty, 2)

                results.append({
                    "Model": model_name,
                    "Language": lang,
                    "Palindrome": gen_output,
                    "Valid": "✅" if valid else "❌",
                    "Length": cleaned_len,
                    "Grammar Score": avg_score,
                    "Final Score": final_score
                })
        finally:
            # Drop this model before the next one is loaded so two large
            # models are not resident at the same time.
            gen_pipeline = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Explicit columns keep sort_values working when no model produced a row.
    df = (
        pd.DataFrame(results, columns=columns)
        .sort_values(by="Final Score", ascending=False)
        .reset_index(drop=True)
    )
    csv_path = "benchmark_results.csv"
    df.to_csv(csv_path, index=False)
    print(f"CSV saved to {os.path.abspath(csv_path)}")
    return gr.Dataframe(df), csv_path

# Build the Gradio UI using Blocks for a canvas layout.
# Components appear in creation order: title, description, a row holding the
# run button, the results table, then the CSV download slot.
with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
    gr.Markdown("# Premium Model Palindrome Benchmark")
    gr.Markdown(
        "This benchmark runs automatically over 6 premium text-generation models across 5 languages and saves the results "
        "to a CSV file upon completion."
    )
    with gr.Row():
        run_button = gr.Button("Run All Benchmarks")
    output_table = gr.Dataframe(label="Benchmark Results")
    output_file = gr.File(label="Download CSV Results")
    # Clicking the button runs the whole benchmark; its two return values fill
    # the table and the file component, in that order.
    run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])

# NOTE(review): launch() is called at module level with no __main__ guard, so
# importing this file also starts the app -- confirm that is intended.
demo.launch()