Spaces:
Sleeping
Sleeping
import os | |
import json | |
import sqlite3 | |
import glob | |
import pandas as pd | |
import gradio as gr | |
from datetime import datetime | |
from typing import Dict, List | |
# Directory to store SQLite results | |
db_dir = "results/" | |
def find_or_download_db(): | |
"""Check if SQLite .db files exist; if not, attempt to download from cloud storage.""" | |
if not os.path.exists(db_dir): | |
os.makedirs(db_dir) | |
db_files = glob.glob(os.path.join(db_dir, "*.db")) | |
# Ensure the random bot database exists | |
if "results/random_None.db" not in db_files: | |
raise FileNotFoundError("Please upload results for the random agent in a file named 'random_None.db'.") | |
return db_files | |
def extract_agent_info(filename: str): | |
"""Extract agent type and model name from the filename.""" | |
base_name = os.path.basename(filename).replace(".db", "") | |
parts = base_name.split("_", 1) | |
if len(parts) == 2: | |
agent_type, model_name = parts | |
else: | |
agent_type, model_name = parts[0], "Unknown" | |
return agent_type, model_name | |
def get_available_games(include_aggregated=True) -> List[str]: | |
"""Extracts all unique game names from all SQLite databases. Includes 'Aggregated Performance' only when required.""" | |
db_files = find_or_download_db() | |
game_names = set() | |
for db_file in db_files: | |
conn = sqlite3.connect(db_file) | |
try: | |
query = "SELECT DISTINCT game_name FROM moves" | |
df = pd.read_sql_query(query, conn) | |
game_names.update(df["game_name"].tolist()) | |
except Exception: | |
pass # Ignore errors if table doesn't exist | |
finally: | |
conn.close() | |
game_list = sorted(game_names) if game_names else ["No Games Found"] | |
if include_aggregated: | |
game_list.insert(0, "Aggregated Performance") # Ensure 'Aggregated Performance' is always first | |
return game_list | |
def extract_illegal_moves_summary()-> pd.DataFrame: | |
"""Extracts the number of illegal moves made by each LLM agent. | |
Returns: | |
pd.DataFrame: DataFrame with columns [agent_name, illegal_moves]. | |
""" | |
db_files = find_or_download_db() | |
summary = [] | |
for db_file in db_files: | |
agent_type, model_name = extract_agent_info(db_file) | |
if agent_type == "random": | |
continue # Skip the random agent from this analysis | |
conn = sqlite3.connect(db_file) | |
try: | |
# Count number of illegal moves from the illegal_moves table | |
df = pd.read_sql_query("SELECT COUNT(*) AS illegal_moves FROM illegal_moves", conn) | |
count = int(df["illegal_moves"].iloc[0]) if not df.empty else 0 | |
except Exception: | |
count = 0 # If the table does not exist or error occurs | |
summary.append({"agent_name": model_name, "illegal_moves": count}) | |
conn.close() | |
return pd.DataFrame(summary) | |
def extract_leaderboard_stats(game_name: str) -> pd.DataFrame: | |
"""Extract and aggregate leaderboard stats from all SQLite databases.""" | |
db_files = find_or_download_db() | |
all_stats = [] | |
for db_file in db_files: | |
conn = sqlite3.connect(db_file) | |
agent_type, model_name = extract_agent_info(db_file) | |
# Skip random agent rows | |
if agent_type == "random": | |
conn.close() | |
continue | |
if game_name == "Aggregated Performance": | |
query = "SELECT COUNT(DISTINCT episode) AS games_played, " \ | |
"SUM(reward) AS total_rewards " \ | |
"FROM game_results" | |
df = pd.read_sql_query(query, conn) | |
# Use avg_generation_time from a specific game (e.g., Kuhn Poker) | |
game_query = "SELECT AVG(generation_time) FROM moves WHERE game_name = 'kuhn_poker'" | |
avg_gen_time = conn.execute(game_query).fetchone()[0] or 0 | |
else: | |
query = "SELECT COUNT(DISTINCT episode) AS games_played, " \ | |
"SUM(reward) AS total_rewards " \ | |
"FROM game_results WHERE game_name = ?" | |
df = pd.read_sql_query(query, conn, params=(game_name,)) | |
# Fetch average generation time from moves table | |
gen_time_query = "SELECT AVG(generation_time) FROM moves WHERE game_name = ?" | |
avg_gen_time = conn.execute(gen_time_query, (game_name,)).fetchone()[0] or 0 | |
# Keep division by 2 for total rewards | |
df["total_rewards"] = df["total_rewards"].fillna(0).astype(float) / 2 | |
# Ensure avg_gen_time has decimals | |
avg_gen_time = round(avg_gen_time, 3) | |
# Calculate win rate against random bot using moves table | |
vs_random_query = """ | |
SELECT COUNT(DISTINCT gr.episode) FROM game_results gr | |
JOIN moves m ON gr.game_name = m.game_name AND gr.episode = m.episode | |
WHERE m.opponent = 'random_None' AND gr.reward > 0 | |
""" | |
total_vs_random_query = """ | |
SELECT COUNT(DISTINCT gr.episode) FROM game_results gr | |
JOIN moves m ON gr.game_name = m.game_name AND gr.episode = m.episode | |
WHERE m.opponent = 'random_None' | |
""" | |
wins_vs_random = conn.execute(vs_random_query).fetchone()[0] or 0 | |
total_vs_random = conn.execute(total_vs_random_query).fetchone()[0] or 0 | |
vs_random_rate = (wins_vs_random / total_vs_random * 100) if total_vs_random > 0 else 0 | |
df.insert(0, "agent_name", model_name) # Ensure agent_name is the first column | |
df.insert(1, "agent_type", agent_type) # Ensure agent_type is second column | |
df["avg_generation_time (sec)"] = avg_gen_time | |
df["win vs_random (%)"] = round(vs_random_rate, 2) | |
all_stats.append(df) | |
conn.close() | |
leaderboard_df = pd.concat(all_stats, ignore_index=True) if all_stats else pd.DataFrame() | |
if leaderboard_df.empty: | |
leaderboard_df = pd.DataFrame(columns=["agent_name", "agent_type", "# games", "total rewards", "avg_generation_time (sec)", "win-rate", "win vs_random (%)"]) | |
return leaderboard_df | |
########################################################## | |
with gr.Blocks() as interface: | |
# Tab for playing games against LLMs | |
with gr.Tab("Game Arena"): | |
gr.Markdown("# Play Against LLMs\nChoose a game and an opponent to play!") | |
# Dropdown to select a game, excluding 'Aggregated Performance' | |
game_dropdown = gr.Dropdown(get_available_games(include_aggregated=False), label="Select a Game") | |
# Dropdown to choose an opponent (Random Bot or LLM) | |
opponent_dropdown = gr.Dropdown(["Random Bot", "LLM"], label="Choose Opponent") | |
# Button to start the game | |
play_button = gr.Button("Start Game") | |
# Textbox to display the game log | |
game_output = gr.Textbox(label="Game Log") | |
# Event to start the game when the button is clicked | |
play_button.click(lambda game, opponent: f"Game {game} started against {opponent}", inputs=[game_dropdown, opponent_dropdown], outputs=[game_output]) | |
# Tab for leaderboard and performance tracking | |
with gr.Tab("Leaderboard"): | |
gr.Markdown("# LLM Model Leaderboard\nTrack performance across different games!") | |
# Dropdown to select a game, including 'Aggregated Performance' | |
leaderboard_game_dropdown = gr.Dropdown(choices=get_available_games(), label="Select Game", value="Aggregated Performance") | |
# Table to display leaderboard statistics | |
leaderboard_table = gr.Dataframe(value=extract_leaderboard_stats("Aggregated Performance"), headers=["agent_name", "agent_type", "# games", "total rewards", "avg_generation_time (sec)", "win-rate", "win vs_random (%)"], every=5) | |
# Update the leaderboard when a new game is selected | |
leaderboard_game_dropdown.change(fn=extract_leaderboard_stats, inputs=[leaderboard_game_dropdown], outputs=[leaderboard_table]) | |
# Tab for visual insights and performance metrics | |
with gr.Tab("Metrics Dashboard"): | |
gr.Markdown("# π Metrics Dashboard\nVisual summaries of LLM performance across games.") | |
# Extract data for visualizations | |
metrics_df = extract_leaderboard_stats("Aggregated Performance") | |
with gr.Row(): | |
gr.BarPlot( | |
value=metrics_df, | |
x="win vs_random (%)", | |
y="agent_name", | |
title="Win Rate vs Random Bot", | |
x_label="LLM Model", | |
y_label="Win Rate (%)" | |
) | |
with gr.Row(): | |
gr.BarPlot( | |
value=metrics_df, | |
x="avg_generation_time (sec)", | |
y="agent_name", | |
title="Average Generation Time", | |
x_label="LLM Model", | |
y_label="Time (sec)" | |
) | |
with gr.Row(): | |
gr.Dataframe(value=metrics_df, label="Performance Summary") | |
# Tab for LLM reasoning and illegal move analysis | |
with gr.Tab("Analysis of LLM Reasoning"): | |
gr.Markdown("# π§ Analysis of LLM Reasoning\nInsights into move legality and decision behavior.") | |
# Load illegal move stats using global function | |
illegal_df = extract_illegal_moves_summary() | |
with gr.Row(): | |
gr.BarPlot( | |
value=illegal_df, | |
x="illegal_moves", | |
y="agent_name", | |
title="Illegal Moves by Model", | |
x_label="LLM Model", | |
y_label="# of Illegal Moves" | |
) | |
with gr.Row(): | |
gr.Dataframe(value=illegal_df, label="Illegal Move Summary") | |
# Launch the Gradio interface | |
interface.launch() | |