Spaces:

lcipolina
/

LLM_OpenSpiel_Arena

Sleeping

App Files Files Community

LLM_OpenSpiel_Arena / app.py

lcipolina

Changed the axis of the plots

613e785 verified about 2 months ago

raw

history blame contribute delete

9.6 kB

	import os
	import json
	import sqlite3
	import glob
	import pandas as pd
	import gradio as gr
	from datetime import datetime
	from typing import Dict, List

	# Directory to store SQLite results
	db_dir = "results/"

	def find_or_download_db():
	"""Check if SQLite .db files exist; if not, attempt to download from cloud storage."""
	if not os.path.exists(db_dir):
	os.makedirs(db_dir)
	db_files = glob.glob(os.path.join(db_dir, "*.db"))

	# Ensure the random bot database exists
	if "results/random_None.db" not in db_files:
	raise FileNotFoundError("Please upload results for the random agent in a file named 'random_None.db'.")

	return db_files

	def extract_agent_info(filename: str):
	"""Extract agent type and model name from the filename."""
	base_name = os.path.basename(filename).replace(".db", "")
	parts = base_name.split("_", 1)
	if len(parts) == 2:
	agent_type, model_name = parts
	else:
	agent_type, model_name = parts[0], "Unknown"
	return agent_type, model_name

	def get_available_games(include_aggregated=True) -> List[str]:
	"""Extracts all unique game names from all SQLite databases. Includes 'Aggregated Performance' only when required."""
	db_files = find_or_download_db()
	game_names = set()

	for db_file in db_files:
	conn = sqlite3.connect(db_file)
	try:
	query = "SELECT DISTINCT game_name FROM moves"
	df = pd.read_sql_query(query, conn)
	game_names.update(df["game_name"].tolist())
	except Exception:
	pass # Ignore errors if table doesn't exist
	finally:
	conn.close()

	game_list = sorted(game_names) if game_names else ["No Games Found"]
	if include_aggregated:
	game_list.insert(0, "Aggregated Performance") # Ensure 'Aggregated Performance' is always first
	return game_list

	def extract_illegal_moves_summary()-> pd.DataFrame:
	"""Extracts the number of illegal moves made by each LLM agent.

	Returns:
	pd.DataFrame: DataFrame with columns [agent_name, illegal_moves].
	"""
	db_files = find_or_download_db()
	summary = []
	for db_file in db_files:
	agent_type, model_name = extract_agent_info(db_file)
	if agent_type == "random":
	continue # Skip the random agent from this analysis
	conn = sqlite3.connect(db_file)
	try:
	# Count number of illegal moves from the illegal_moves table
	df = pd.read_sql_query("SELECT COUNT(*) AS illegal_moves FROM illegal_moves", conn)
	count = int(df["illegal_moves"].iloc[0]) if not df.empty else 0
	except Exception:
	count = 0 # If the table does not exist or error occurs
	summary.append({"agent_name": model_name, "illegal_moves": count})
	conn.close()
	return pd.DataFrame(summary)

	def extract_leaderboard_stats(game_name: str) -> pd.DataFrame:
	"""Extract and aggregate leaderboard stats from all SQLite databases."""
	db_files = find_or_download_db()
	all_stats = []

	for db_file in db_files:
	conn = sqlite3.connect(db_file)
	agent_type, model_name = extract_agent_info(db_file)

	# Skip random agent rows
	if agent_type == "random":
	conn.close()
	continue

	if game_name == "Aggregated Performance":
	query = "SELECT COUNT(DISTINCT episode) AS games_played, " \
	"SUM(reward) AS total_rewards " \
	"FROM game_results"
	df = pd.read_sql_query(query, conn)

	# Use avg_generation_time from a specific game (e.g., Kuhn Poker)
	game_query = "SELECT AVG(generation_time) FROM moves WHERE game_name = 'kuhn_poker'"
	avg_gen_time = conn.execute(game_query).fetchone()[0] or 0
	else:
	query = "SELECT COUNT(DISTINCT episode) AS games_played, " \
	"SUM(reward) AS total_rewards " \
	"FROM game_results WHERE game_name = ?"
	df = pd.read_sql_query(query, conn, params=(game_name,))

	# Fetch average generation time from moves table
	gen_time_query = "SELECT AVG(generation_time) FROM moves WHERE game_name = ?"
	avg_gen_time = conn.execute(gen_time_query, (game_name,)).fetchone()[0] or 0

	# Keep division by 2 for total rewards
	df["total_rewards"] = df["total_rewards"].fillna(0).astype(float) / 2

	# Ensure avg_gen_time has decimals
	avg_gen_time = round(avg_gen_time, 3)

	# Calculate win rate against random bot using moves table
	vs_random_query = """
	SELECT COUNT(DISTINCT gr.episode) FROM game_results gr
	JOIN moves m ON gr.game_name = m.game_name AND gr.episode = m.episode
	WHERE m.opponent = 'random_None' AND gr.reward > 0
	"""
	total_vs_random_query = """
	SELECT COUNT(DISTINCT gr.episode) FROM game_results gr
	JOIN moves m ON gr.game_name = m.game_name AND gr.episode = m.episode
	WHERE m.opponent = 'random_None'
	"""
	wins_vs_random = conn.execute(vs_random_query).fetchone()[0] or 0
	total_vs_random = conn.execute(total_vs_random_query).fetchone()[0] or 0
	vs_random_rate = (wins_vs_random / total_vs_random * 100) if total_vs_random > 0 else 0

	df.insert(0, "agent_name", model_name) # Ensure agent_name is the first column
	df.insert(1, "agent_type", agent_type) # Ensure agent_type is second column
	df["avg_generation_time (sec)"] = avg_gen_time
	df["win vs_random (%)"] = round(vs_random_rate, 2)

	all_stats.append(df)
	conn.close()

	leaderboard_df = pd.concat(all_stats, ignore_index=True) if all_stats else pd.DataFrame()

	if leaderboard_df.empty:
	leaderboard_df = pd.DataFrame(columns=["agent_name", "agent_type", "# games", "total rewards", "avg_generation_time (sec)", "win-rate", "win vs_random (%)"])

	return leaderboard_df


	##########################################################
	with gr.Blocks() as interface:
	# Tab for playing games against LLMs
	with gr.Tab("Game Arena"):
	gr.Markdown("# Play Against LLMs\nChoose a game and an opponent to play!")
	# Dropdown to select a game, excluding 'Aggregated Performance'
	game_dropdown = gr.Dropdown(get_available_games(include_aggregated=False), label="Select a Game")
	# Dropdown to choose an opponent (Random Bot or LLM)
	opponent_dropdown = gr.Dropdown(["Random Bot", "LLM"], label="Choose Opponent")
	# Button to start the game
	play_button = gr.Button("Start Game")
	# Textbox to display the game log
	game_output = gr.Textbox(label="Game Log")

	# Event to start the game when the button is clicked
	play_button.click(lambda game, opponent: f"Game {game} started against {opponent}", inputs=[game_dropdown, opponent_dropdown], outputs=[game_output])

	# Tab for leaderboard and performance tracking
	with gr.Tab("Leaderboard"):
	gr.Markdown("# LLM Model Leaderboard\nTrack performance across different games!")
	# Dropdown to select a game, including 'Aggregated Performance'
	leaderboard_game_dropdown = gr.Dropdown(choices=get_available_games(), label="Select Game", value="Aggregated Performance")
	# Table to display leaderboard statistics
	leaderboard_table = gr.Dataframe(value=extract_leaderboard_stats("Aggregated Performance"), headers=["agent_name", "agent_type", "# games", "total rewards", "avg_generation_time (sec)", "win-rate", "win vs_random (%)"], every=5)
	# Update the leaderboard when a new game is selected
	leaderboard_game_dropdown.change(fn=extract_leaderboard_stats, inputs=[leaderboard_game_dropdown], outputs=[leaderboard_table])

	# Tab for visual insights and performance metrics
	with gr.Tab("Metrics Dashboard"):
	gr.Markdown("# 📊 Metrics Dashboard\nVisual summaries of LLM performance across games.")

	# Extract data for visualizations
	metrics_df = extract_leaderboard_stats("Aggregated Performance")

	with gr.Row():
	gr.BarPlot(
	value=metrics_df,
	x="win vs_random (%)",
	y="agent_name",
	title="Win Rate vs Random Bot",
	x_label="LLM Model",
	y_label="Win Rate (%)"
	)

	with gr.Row():
	gr.BarPlot(
	value=metrics_df,
	x="avg_generation_time (sec)",
	y="agent_name",
	title="Average Generation Time",
	x_label="LLM Model",
	y_label="Time (sec)"
	)

	with gr.Row():
	gr.Dataframe(value=metrics_df, label="Performance Summary")

	# Tab for LLM reasoning and illegal move analysis
	with gr.Tab("Analysis of LLM Reasoning"):
	gr.Markdown("# 🧠 Analysis of LLM Reasoning\nInsights into move legality and decision behavior.")

	# Load illegal move stats using global function
	illegal_df = extract_illegal_moves_summary()

	with gr.Row():
	gr.BarPlot(
	value=illegal_df,
	x="illegal_moves",
	y="agent_name",
	title="Illegal Moves by Model",
	x_label="LLM Model",
	y_label="# of Illegal Moves"
	)

	with gr.Row():
	gr.Dataframe(value=illegal_df, label="Illegal Move Summary")

	# Launch the Gradio interface
	interface.launch()