Spaces:

MVLLL
/

Multi-view-leaderboard

Running

ajaxzhan

add analysis and vis support

32b50e6 4 days ago

17.8 kB

	import gradio as gr
	import pandas as pd
	import pandas as pd
	import json
	import plotly.express as px

	def on_confirm(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
	    """Load the ranking table for the selected configuration and attach per-model analysis.

	    Args:
	        dataset_radio: "HumanEval" selects the HumanEval results directory;
	            any other value selects the MBPP one.
	        num_parts_dropdown: Number of subsets, used as a path component.
	        perspective_radio: Perspective label, matched by the substrings
	            "Tokens", "Lines" and "Complexity"; anything else reads the
	            problem-type results.
	        division_method_radio: "Equal Frequency Partitioning" maps to "QS";
	            any other value maps to "EI".

	    Returns:
	        The CSV contents as a DataFrame with an extra "Analysis" column.
	    """
	    if dataset_radio == "HumanEval":
	        base_path = "./dividing_into_different_subsets"
	    else:  # MBPP
	        base_path = "./dividing_into_different_subsets_mbpp"

	    method = "QS" if division_method_radio == "Equal Frequency Partitioning" else "EI"
	    subset_dir = f"{base_path}/{num_parts_dropdown}/{method}"

	    # Pick the CSV that matches the selected perspective.
	    if "Tokens" in perspective_radio:
	        csv_path = f"{subset_dir}/token_counts_{method}.csv"
	    elif "Lines" in perspective_radio:
	        csv_path = f"{subset_dir}/line_counts_{method}.csv"
	    elif "Complexity" in perspective_radio:
	        csv_path = f"{subset_dir}/CC_{method}.csv"
	    else:
	        # Anything else is treated as "Problem Types", so `df` is always bound.
	        csv_path = f"{base_path}/cata_result.csv"
	    df = pd.read_csv(csv_path)

	    # Load the analysis report.
	    analysis_result, _ = load_analysis_report(
	        dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio
	    )
	    # On failure the loader returns an error string rather than a mapping;
	    # fall back to an empty mapping so the table still renders.
	    if not isinstance(analysis_result, dict):
	        analysis_result = {}

	    # AI analysis column.
	    df["Analysis"] = df["Model"].map(lambda m: analysis_result.get(m, "No analysis provided."))
	    return df

	# Generate CSS styles
	def generate_css(line_counts, token_counts, cyclomatic_complexity, problem_type, show_high, show_medium, show_low):
	    """Return the CSS text that tints the columns of the ``#dataframe`` table.

	    Each enabled category (line counts, token counts, cyclomatic complexity)
	    gets one tinted column per enabled level flag (high / medium / low),
	    assigned left to right starting at the second column. When
	    ``problem_type`` is truthy the next three columns share one fixed colour.
	    """
	    # NOTE(review): the source indentation was lost; this assumes the colour
	    # index advances once per category whether or not the category is enabled.
	    palette = ["#e6f7ff", "#ffeecc", "#e6ffe6", "#ffe6e6"]
	    cell_rule = "#dataframe td:nth-child({}) {{ background-color: {}; }}\n"

	    pieces = ["""
	#dataframe th {
	background-color: #f2f2f2

	}
	"""]

	    level_flags = (show_high, show_medium, show_low)
	    last_col = 1
	    for tint, enabled in zip(palette, (line_counts, token_counts, cyclomatic_complexity)):
	        if not enabled:
	            continue
	        for flag in level_flags:
	            if flag:
	                last_col += 1
	                pieces.append(cell_rule.format(last_col, tint))

	    # The three problem-type sub-columns use one fixed colour.
	    if problem_type:
	        problem_type_color = "#d4f0fc"
	        for offset in (1, 2, 3):
	            pieces.append(cell_rule.format(last_col + offset, problem_type_color))

	    # Hide the "data" marker.
	    pieces.append("""
	.gradio-container .dataframe-container::before {
	content: none !important;
	}
	""")

	    return "".join(pieces)

	# AI analysis
	def load_analysis_report(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
	    """Read the analysis report and the model recommendation for a configuration.

	    Both files live under ``./llm_insight/<dataset>/``; every perspective
	    except the problem-type one adds ``<num_parts>/<method>/`` to the path.

	    Returns:
	        A ``(analysis_result, recommendation_result)`` tuple. Each element is
	        the parsed JSON content, or an ``"[Error] ..."`` string when that
	        file could not be loaded.
	    """
	    if division_method_radio == "Equal Frequency Partitioning":
	        split_code = "QS"
	    else:
	        split_code = "EI"

	    # The first keyword found in the label wins; the fallback is the problem-type perspective.
	    keyword_to_stem = (
	        ("Tokens", "token_counts"),
	        ("Lines", "line_counts"),
	        ("Complexity", "CC"),
	    )
	    stem = next(
	        (name for keyword, name in keyword_to_stem if keyword in perspective_radio),
	        "problem_type",
	    )

	    root = "./llm_insight"
	    if stem == "problem_type":
	        folder = f"{root}/{dataset_radio}"
	    else:
	        folder = f"{root}/{dataset_radio}/{num_parts_dropdown}/{split_code}"
	    report_file = f"{folder}/{stem}_report.json"
	    recommendation_file = f"{folder}/{stem}_recommendation.json"

	    try:
	        with open(report_file, 'r', encoding='utf-8') as handle:
	            analysis_result = json.load(handle)
	    except Exception as e:
	        analysis_result = f"[Error] error load analysis report: {e}"

	    try:
	        with open(recommendation_file, 'r', encoding='utf-8') as handle:
	            recommendation_result = json.load(handle)
	    except Exception as e:
	        recommendation_result = f"[Error] error load model recommendation: {e}"

	    return (analysis_result, recommendation_result)

	# Visualization
	def plot_visualization(dataset_radio, perspective_radio, num_parts, plot_type):
	    """Build a Plotly figure comparing models across subsets.

	    Args:
	        dataset_radio: "HumanEval" selects the HumanEval results directory;
	            any other value selects the MBPP one.
	        perspective_radio: Perspective label, matched by the substrings
	            "Tokens", "Lines" and "Complexity"; anything else reads the
	            problem-type results.
	        num_parts: Number of subsets, used as a path component.
	        plot_type: "Line Chart", "Radar Chart", or any other value for a heatmap.

	    Returns:
	        The Plotly figure for the requested chart type.
	    """
	    if dataset_radio == "HumanEval":
	        base_path = "./dividing_into_different_subsets"
	    else:  # MBPP
	        base_path = "./dividing_into_different_subsets_mbpp"

	    # NOTE(review): only the "QS" files are read here, whatever division
	    # method is selected in the UI — confirm this is intended.
	    if "Tokens" in perspective_radio:
	        file_path = f'{base_path}/{num_parts}/QS/token_counts_QS.csv'
	    elif "Lines" in perspective_radio:
	        file_path = f'{base_path}/{num_parts}/QS/line_counts_QS.csv'
	    elif "Complexity" in perspective_radio:
	        file_path = f'{base_path}/{num_parts}/QS/CC_QS.csv'
	    else:  # Problem Types
	        file_path = f'{base_path}/cata_result.csv'

	    df = pd.read_csv(file_path)
	    df.set_index('Model', inplace=True)
	    # Rows become subsets and columns become models.
	    df_transposed = df.T

	    if plot_type == "Line Chart":
	        fig = px.line(
	            df_transposed,
	            x=df_transposed.index,
	            y=df_transposed.columns,
	            title='Model Performance Across Different Subsets',
	            labels={'value': 'Evaluation Score', 'index': 'Subsets'},
	            color_discrete_sequence=px.colors.qualitative.Plotly,
	        )
	        fig.update_traces(hovertemplate='%{y}')
	    elif plot_type == "Radar Chart":
	        # Reshape to long form: one row per (model, subset) pair.
	        radar_df = pd.DataFrame([
	            {'Model': model, 'Subset': subset, 'Score': score}
	            for model in df.index
	            for subset, score in df.loc[model].items()
	        ])

	        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
	                  '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

	        # Create the radar chart.
	        fig = px.line_polar(
	            radar_df,
	            r='Score',
	            theta='Subset',
	            color='Model',
	            line_close=True,
	            color_discrete_sequence=colors,
	            title='Model Performance Radar Chart',
	        )

	        # Per-model line style: no fill, alternating solid and dashed lines.
	        for i, trace in enumerate(fig.data):
	            trace.update(
	                fill=None,
	                line=dict(
	                    width=2,
	                    dash='solid' if i % 2 == 0 else 'dash',
	                ),
	            )

	        # Tune the radar chart layout.
	        fig.update_layout(
	            polar=dict(
	                radialaxis=dict(
	                    visible=True,
	                    range=[0, 100],
	                    showline=True,
	                    linewidth=1,
	                    gridcolor='lightgrey',
	                ),
	                angularaxis=dict(
	                    showline=True,
	                    linewidth=1,
	                    gridcolor='lightgrey',
	                ),
	            ),
	            showlegend=True,
	            legend=dict(
	                yanchor="middle",  # vertically centred
	                y=0.5,
	                xanchor="left",
	                x=1.2,  # move the legend to the right of the chart
	                bgcolor="rgba(255, 255, 255, 0.8)",  # semi-transparent white background
	                bordercolor="lightgrey",
	                borderwidth=1,
	            ),
	            margin=dict(r=150),  # extra right margin to make room for the legend
	            paper_bgcolor='white',
	        )
	    else:  # Heatmap
	        fig = px.imshow(
	            df_transposed,
	            labels=dict(x="Model", y="Subset", color="Score"),
	            color_continuous_scale="RdYlBu_r",
	            aspect="auto",  # let the aspect ratio adapt to the data
	            title="Model Performance Heatmap",
	        )

	        # Tune the heatmap layout.
	        fig.update_layout(
	            title=dict(
	                text='Model Performance Distribution Across Subsets',
	                x=0.5,
	                y=0.95,
	                xanchor='center',
	                yanchor='top',
	                font=dict(size=14),
	            ),
	            xaxis=dict(
	                title="Model",
	                tickangle=45,  # slant the model names
	                tickfont=dict(size=10),
	                side="bottom",
	            ),
	            yaxis=dict(
	                title="Subset",
	                tickfont=dict(size=10),
	            ),
	            coloraxis=dict(
	                colorbar=dict(
	                    # The flat `titleside` / `titlefont` attributes are
	                    # deprecated and rejected by newer plotly releases;
	                    # the nested title form is the supported spelling.
	                    title=dict(
	                        text="Score",
	                        side="right",
	                        font=dict(size=12),
	                    ),
	                    tickfont=dict(size=10),
	                    len=0.9,  # colour bar length
	                ),
	            ),
	            margin=dict(t=80, r=100, b=80, l=80),
	            paper_bgcolor='white',
	            plot_bgcolor='white',
	        )

	        # Annotate each cell with its value, positioned by row/column index.
	        annotations = []
	        for i, row in enumerate(df_transposed.to_numpy()):
	            for j, value in enumerate(row):
	                annotations.append(
	                    dict(
	                        x=j,
	                        y=i,
	                        text=f"{value:.1f}",
	                        showarrow=False,
	                        font=dict(size=9, color='black'),
	                    )
	                )
	        fig.update_layout(annotations=annotations)

	    return fig

	# Sunburst chart
	def _recommendation_pairs(model_list):
	    """Flatten one scenario's recommendation entry into ``(model, reason)`` pairs."""
	    if isinstance(model_list, dict):
	        return list(model_list.items())
	    if isinstance(model_list, list):
	        return [
	            (model, reason)
	            for entry in model_list
	            if isinstance(entry, dict)
	            for model, reason in entry.items()
	        ]
	    return []


	def plot_recommendation_sunburst(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
	    """Build a sunburst chart of recommended models grouped by scenario.

	    The root node is "Model Recommendation", its children are the scenarios
	    (labels shortened to the first three words), and the leaves are the
	    recommended models. Hovering shows the full scenario text or the
	    recommendation reason.

	    Returns:
	        A Plotly figure. When the recommendation data is not a mapping (the
	        loader hands back an error string on failure), the figure contains
	        only that message instead of raising.
	    """
	    import plotly.graph_objects as go

	    _, recommendation_result = load_analysis_report(
	        dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio
	    )

	    if not isinstance(recommendation_result, dict):
	        # Show the failure message rather than crashing on `.items()`.
	        fig = go.Figure()
	        fig.add_annotation(
	            text=str(recommendation_result),
	            x=0.5, y=0.5, xref="paper", yref="paper",
	            showarrow=False,
	        )
	        fig.update_layout(
	            xaxis=dict(visible=False),
	            yaxis=dict(visible=False),
	            margin=dict(t=10, l=10, r=10, b=10),
	            height=500,
	        )
	        return fig

	    scenarios = [
	        (scenario, _recommendation_pairs(model_list))
	        for scenario, model_list in recommendation_result.items()
	    ]

	    # Explicit ids keep nodes distinct when a model appears under several
	    # scenarios or when two scenarios share the same shortened label.
	    root_id = 'Model Recommendation'
	    ids = [root_id]
	    labels = ['Model Recommendation']  # root node
	    parents = ['']
	    # With branchvalues="total", each parent's value is the sum of its children.
	    values = [sum(len(pairs) for _, pairs in scenarios)]
	    customdata = ['Choose your preference model']

	    for scenario, pairs in scenarios:
	        scenario_words = scenario.split()
	        short_label = " ".join(scenario_words[:3]) + "..." if len(scenario_words) > 3 else scenario
	        scenario_id = f"{root_id}/{scenario}"

	        ids.append(scenario_id)
	        labels.append(short_label)
	        parents.append(root_id)
	        values.append(len(pairs))
	        customdata.append(scenario)

	        for position, (model, reason) in enumerate(pairs):
	            ids.append(f"{scenario_id}/{position}:{model}")
	            labels.append(model)
	            parents.append(scenario_id)
	            values.append(1)
	            customdata.append(reason)

	    fig = go.Figure(go.Sunburst(
	        ids=ids,
	        labels=labels,
	        parents=parents,
	        values=values,
	        branchvalues="total",
	        hovertemplate='%{customdata}<extra></extra>',
	        customdata=customdata,
	    ))
	    fig.update_layout(margin=dict(t=10, l=10, r=10, b=10), height=500)
	    return fig

	### Gradio UI section ###

	# Custom CSS styles
	# Raw CSS handed to gr.Blocks(css=...). It must not be wrapped in
	# <style> tags: the value is injected as stylesheet text, so literal tags
	# would be parsed as part of the first selector and break that rule.
	custom_css = """
	body {
	font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
	background-color: #f9f9f9;
	}
	.gr-label {
	font-size: 15px;
	}
	.gr-button-primary {
	background-color: #4CAF50;
	color: white;
	border-radius: 8px;
	}
	.gr-tabs > .tab-nav {
	background-color: #e0e0e0;
	border-bottom: 2px solid #ccc;
	}
	.gr-tabs > .tab-nav button.selected {
	background-color: #ffffff !important;
	border-bottom: 2px solid #4CAF50;
	}
	.gr-panel {
	padding: 20px;
	border-radius: 10px;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
	background-color: #fff;
	}
	.markdown-title {
	font-size: 1.5em;
	font-weight: bold;
	margin-bottom: 10px;
	}
	.analysis-box {
	background-color: #f1f8ff;
	padding: 20px;
	border-left: 5px solid #4CAF50;
	border-radius: 6px;
	margin-top: 10px;
	}
	.recommendation-box {
	background-color: #fff3cd;
	padding: 20px;
	border-left: 5px solid #ff9800;
	border-radius: 6px;
	margin-top: 10px;
	}
	"""

	# Build the interface
	with gr.Blocks(css=custom_css) as iface:
	    gr.HTML("""
	<div style='text-align:center; padding:15px;'>
	<h1>Multi-view Code LLM Leaderboard</h1>
	<p>Multi-view Leaderboard: Evaluating Large Language Models From Multiple Views</p>
	</div>
	""")

	    # NOTE(review): the source indentation was lost; the nesting below is
	    # reconstructed from the inline comments — confirm against the original layout.
	    with gr.Row():
	        # Configuration panel.
	        with gr.Column(scale=1):
	            dataset_radio = gr.Radio(
	                ["HumanEval", "MBPP"],
	                label="Select a dataset",
	                value="HumanEval"
	            )
	            num_parts_slider = gr.Slider(
	                minimum=3,
	                maximum=8,
	                step=1,
	                label="Choose the Number of Subsets",
	                value=3
	            )

	            # A single radio replaces the earlier set of checkboxes.
	            perspective_radio = gr.Radio(
	                ["I - Num of Tokens in Problem Desc",
	                 "II - Num of Lines in Problem Desc",
	                 "III - Complexity of Reference Code",
	                 "IV - Problem Types"],
	                label="Choose Perspective",
	                value="I - Num of Tokens in Problem Desc"
	            )

	            # One division-method radio shared by all perspectives.
	            # NOTE(review): no default value is set, so until the user picks
	            # one the handlers receive None and treat it as "EI".
	            division_method_radio = gr.Radio(
	                ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
	                label="Choose the Division Method",
	                visible=True
	            )

	            confirm_btn = gr.Button("Confirm", variant="primary")

	        # Main display area.
	        with gr.Column(scale=2):
	            with gr.Tabs():
	                # Ranking table.
	                with gr.TabItem("Ranking Table"):
	                    data_table = gr.Dataframe(headers=["Model", "Score", "Analysis"], interactive=True)
	                # Visualization.
	                with gr.TabItem("Visualization"):
	                    plot_type = gr.Radio(
	                        choices=["Line Chart", "Radar Chart", "Heatmap"],
	                        label="Select Plot Type",
	                        value="Line Chart"
	                    )
	                    chart = gr.Plot()
	                # AI analysis.
	                with gr.TabItem("Model selection suggestions"):
	                    with gr.Column():
	                        gr.Markdown("<h2 class='markdown-title'>🎯 Model Recommendation</h2>")
	                        recommendation_plot = gr.Plot()
	                        scenario_legend = gr.Markdown(value="")  # legend placeholder

	    def update_perspective_options(dataset, current_perspective=None):
	        """Return the perspective choices valid for ``dataset``.

	        MBPP has no "II - Num of Lines in Problem Desc" perspective. When the
	        current selection is not among the new choices it is reset to the
	        first choice, so a stale selection cannot be submitted.
	        """
	        choices = [
	            "I - Num of Tokens in Problem Desc",
	            "II - Num of Lines in Problem Desc",
	            "III - Complexity of Reference Code",
	            "IV - Problem Types"
	        ]
	        if dataset == "MBPP":
	            choices.remove("II - Num of Lines in Problem Desc")
	        if current_perspective in choices:
	            return gr.update(choices=choices)
	        return gr.update(choices=choices, value=choices[0])

	    dataset_radio.change(
	        fn=update_perspective_options,
	        inputs=[dataset_radio, perspective_radio],
	        outputs=perspective_radio
	    )

	    # Event wiring: refresh the table, then the chart, then the sunburst.
	    # The former intermediate step that called load_analysis_report returned
	    # two values into a single throwaway gr.State(), and nothing read it;
	    # on_confirm and plot_recommendation_sunburst load the reports themselves.
	    confirm_btn.click(
	        fn=on_confirm,
	        inputs=[dataset_radio, num_parts_slider, perspective_radio, division_method_radio],
	        outputs=data_table
	    ).then(
	        fn=plot_visualization,
	        inputs=[dataset_radio, perspective_radio, num_parts_slider, plot_type],
	        outputs=chart
	    ).then(
	        fn=plot_recommendation_sunburst,
	        inputs=[dataset_radio, num_parts_slider, perspective_radio, division_method_radio],
	        outputs=[recommendation_plot]
	    )

	    plot_type.change(
	        fn=plot_visualization,
	        inputs=[dataset_radio, perspective_radio, num_parts_slider, plot_type],
	        outputs=chart
	    )

	# Launch the interface.
	iface.launch()