import os
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import pipeline, set_seed

# Check GPU availability (for debugging).
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected. Running on CPU.")

# Set seed for reproducibility.
set_seed(42)

# The six premium generation models. Some of these checkpoints are multimodal
# or embedding models and may not load as plain text-generation pipelines;
# loading failures are caught below and the model is skipped.
premium_models = [
    "Qwen/Qwen2.5-Omni-7B",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "deepseek-ai/Janus-Pro-7B",
    "meta-llama/Llama-2-7b-hf",
    "Alibaba-NLP/gte-Qwen2-7B-instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]

# The five target languages: English, German, Spanish, French, Portuguese.
languages = {
    "en": "English",
    "de": "German",
    "es": "Spanish",
    "fr": "French",
    "pt": "Portuguese",
}

# Two cost-effective grammar evaluation models.
grammar_model_names = [
    "vennify/t5-base-grammar-correction",
    "hassaanik/grammar-correction-model",
]

# Determine device: use GPU (0) if available, otherwise CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# Load a generation pipeline on the selected device; return None on failure.
def load_generation_pipeline(model_name):
    try:
        return pipeline("text-generation", model=model_name, device=device)
    except Exception as e:
        print(f"Error loading generation model {model_name}: {e}")
        return None

# Load a grammar evaluation pipeline on the selected device; return None on failure.
def load_grammar_pipeline(model_name):
    try:
        return pipeline("text2text-generation", model=model_name, device=device)
    except Exception as e:
        print(f"Error loading grammar model {model_name}: {e}")
        return None

# Pre-load the grammar evaluators once so they are reused across all models.
rater_models = []
for model_name in grammar_model_names:
    p = load_grammar_pipeline(model_name)
    if p is not None:
        rater_models.append(p)

# Lowercase and keep only alphanumeric characters. str.isalnum() is used
# instead of an ASCII-only regex so that accented characters in German,
# Spanish, French, and Portuguese survive normalization.
def clean_text(text):
    return "".join(ch for ch in text.lower() if ch.isalnum())

def is_palindrome(text):
    cleaned = clean_text(text)
    return cleaned == cleaned[::-1]

# The prompt instructs the model to output only the palindrome.
def build_prompt(lang):
    return (
        f"Instruction: Generate a single original palindrome in {lang}.\n"
        "Output only the palindrome. The palindrome should be a continuous text "
        "that reads the same forward and backward.\n"
        "Do not output any additional text or commentary.\n"
        "Palindrome: "
    )

def grammar_prompt(pal, lang):
    return (
        f"Rate from 0 to 100 how grammatically correct this palindrome is in {lang}. "
        "Return only a number with no explanation.\n\n"
        f'"{pal}"\n'
    )

# Pull the first 1-3 digit number out of a rater response, clamped to [0, 100].
def extract_score(text):
    match = re.search(r"\d{1,3}", text)
    if match:
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0
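# Quick sanity checks for the helpers above. These are illustrative examples
# (not benchmark data), use only functions defined in this file, and can be
# deleted without affecting the app.
assert is_palindrome("A man, a plan, a canal: Panama")
assert not is_palindrome("Hello, world")
assert extract_score("I'd rate this 87 out of 100.") == 87
assert extract_score("no digits here") == 0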
# Main benchmark function: runs every model/language combination and saves
# the results to CSV.
def run_benchmark_all():
    results = []
    for model_name in premium_models:
        gen_pipeline = load_generation_pipeline(model_name)
        if gen_pipeline is None:
            continue
        for code, lang in languages.items():
            prompt = build_prompt(lang)
            try:
                # return_full_text=False strips the prompt so that only the
                # model's continuation is validated and scored.
                gen_output = gen_pipeline(
                    prompt,
                    max_new_tokens=100,
                    do_sample=True,
                    return_full_text=False,
                )[0]["generated_text"].strip()
            except Exception as e:
                gen_output = f"Error generating text: {e}"
            valid = is_palindrome(gen_output)
            cleaned_len = len(clean_text(gen_output))
            scores = []
            for rater in rater_models:
                rprompt = grammar_prompt(gen_output, lang)
                try:
                    rtext = rater(rprompt, max_new_tokens=10)[0]["generated_text"]
                    score = extract_score(rtext)
                    scores.append(score)
                except Exception:
                    scores.append(0)
            avg_score = np.mean(scores) if scores else 0
            # Final score = cleaned length weighted by grammar quality; the
            # weight is halved when the output is not actually a palindrome.
            multiplier = (avg_score / 100) if valid else (avg_score / 100) * 0.5
            final_score = round(cleaned_len * multiplier, 2)
            results.append({
                "Model": model_name,
                "Language": lang,
                "Palindrome": gen_output,
                "Valid": "✅" if valid else "❌",
                "Length": cleaned_len,
                "Grammar Score": avg_score,
                "Final Score": final_score,
            })
    if not results:
        print("No results were produced; all models failed to load.")
        return pd.DataFrame(), None
    df = (
        pd.DataFrame(results)
        .sort_values(by="Final Score", ascending=False)
        .reset_index(drop=True)
    )
    csv_path = "benchmark_results.csv"
    df.to_csv(csv_path, index=False)
    print(f"CSV saved to {os.path.abspath(csv_path)}")
    # Return the raw DataFrame (not a gr.Dataframe component) so Gradio can
    # update the output table, plus the CSV path for the file download.
    return df, csv_path

# Build the Gradio UI using Blocks for a canvas layout.
with gr.Blocks(title="Premium Model Palindrome Benchmark") as demo:
    gr.Markdown("# Premium Model Palindrome Benchmark")
    gr.Markdown(
        "This benchmark runs automatically over 6 premium text-generation models "
        "across 5 languages and saves the results to a CSV file upon completion."
    )
    with gr.Row():
        run_button = gr.Button("Run All Benchmarks")
    output_table = gr.Dataframe(label="Benchmark Results")
    output_file = gr.File(label="Download CSV Results")
    run_button.click(fn=run_benchmark_all, inputs=[], outputs=[output_table, output_file])

demo.launch()
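# Optional usage notes (standard Gradio launch options, shown as a hint rather
# than part of the benchmark): launch(share=True) creates a temporary public
# link, and launch(server_port=7860) pins the port. To run headlessly without
# the UI, call run_benchmark_all() directly:
#
#   df, csv_path = run_benchmark_all()
#   print(df.head())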