diff --git a/app.py b/app.py
index b25f2760cc4f061555c6afa39ae268beaffeb40e..6bcab626fe3322007a3543919b0a6a2d9502b37c 100644
--- a/app.py
+++ b/app.py
@@ -1,103 +1,37 @@
 import gradio as gr
 import pandas as pd
-import requests
-import os
-import shutil
-import json
 import pandas as pd
-import subprocess
+import json
 import plotly.express as px
-def on_confirm(dataset_radio, num_parts_dropdown, token_counts_radio, line_counts_radio, cyclomatic_complexity_radio, problem_type_checkbox):
+
+def on_confirm(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
     # Build the file paths from the user's selections
     num_parts = num_parts_dropdown
-    # token_counts_split = token_counts_radio
-    # line_counts_split = line_counts_radio
-    # cyclomatic_complexity_split = cyclomatic_complexity_radio
-
-
-    # Load the data
-    dataframes = []
+
     if dataset_radio == "HumanEval":
-        if token_counts_radio=="Equal Frequency Partitioning":  # equal-frequency split: each subset holds roughly the same number of data points
-            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/token_counts_QS.csv")
-            dataframes.append(token_counts_df)
-        if token_counts_radio=="Equal Interval Partitioning":
-            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/token_counts_EI.csv")
-            dataframes.append(token_counts_df)
-        if line_counts_radio=="Equal Frequency Partitioning":  # equal-frequency split
-            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/line_counts_QS.csv")
-            dataframes.append(line_counts_df)
-        if token_counts_radio=="Equal Interval Partitioning":
-            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/line_counts_EI.csv")
-            dataframes.append(line_counts_df)
-        if cyclomatic_complexity_radio=="Equal Frequency Partitioning":  # equal-frequency split
-            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/CC_QS.csv")
-            dataframes.append(CC_df)
-        if token_counts_radio=="Equal Interval Partitioning":
-            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/CC_EI.csv")
-            dataframes.append(CC_df)
-
-
-
-        # The following now reads directly from a single partition file
-        if problem_type_checkbox:
-            problem_type_df = pd.read_csv("/home/user/app/dividing_into_different_subsets/cata_result.csv")
-            dataframes.append(problem_type_df)
-    if dataset_radio == "MBPP":
-        if token_counts_radio=="Equal Frequency Partitioning":  # equal-frequency split
-            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/token_counts_QS.csv")
-            dataframes.append(token_counts_df)
-        if token_counts_radio=="Equal Interval Partitioning":
-            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/token_counts_EI.csv")
-            dataframes.append(token_counts_df)
-        if line_counts_radio=="Equal Frequency Partitioning":  # equal-frequency split
-            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/line_counts_QS.csv")
-            dataframes.append(line_counts_df)
-        if token_counts_radio=="Equal Interval Partitioning":
-            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/line_counts_EI.csv")
-            dataframes.append(line_counts_df)
-        if cyclomatic_complexity_radio=="Equal Frequency Partitioning":  # equal-frequency split
-            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/CC_QS.csv")
-            dataframes.append(CC_df)
-        if token_counts_radio=="Equal Interval Partitioning":
-            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/CC_EI.csv")
-            dataframes.append(CC_df)
-
-
-
-        # The following now reads directly from a single partition file
-        if problem_type_checkbox:
-            problem_type_df = pd.read_csv("/home/user/app/dividing_into_different_subsets_mbpp/cata_result.csv")
-            dataframes.append(problem_type_df)
-
-    # If all three radios have a value, join the rows of the three files
-    if len(dataframes) > 0:
-        combined_df = dataframes[0]
-        for df in dataframes[1:]:
-            combined_df = pd.merge(combined_df, df, left_index=True, right_index=True, suffixes=('', '_y'))
-            combined_df = combined_df.loc[:, ~combined_df.columns.str.endswith('_y')]  # drop duplicated columns
-        return combined_df
-    else:
-        return pd.DataFrame()
-
-
-
-
-def execute_specified_python_files(directory_list, file_list):
-    for directory in directory_list:
-        for py_file in file_list:
-            file_path = os.path.join(directory, py_file)
-            if os.path.isfile(file_path) and py_file.endswith('.py'):
-                print(f"Executing {file_path}...")
-                try:
-                    # Run the Python file via subprocess
-                    subprocess.run(['python', file_path], check=True)
-                    print(f"{file_path} executed successfully.")
-                except subprocess.CalledProcessError as e:
-                    print(f"Error executing {file_path}: {e}")
-            else:
-                print(f"File {file_path} does not exist or is not a Python file.")
-# Define a function to generate the CSS styles
+        base_path = "./dividing_into_different_subsets"
+    else:  # MBPP
+        base_path = "./dividing_into_different_subsets_mbpp"
+
+    method = "QS" if division_method_radio == "Equal Frequency Partitioning" else "EI"
+
+    # Pick the file that matches the selected perspective
+    if "Tokens" in perspective_radio:
+        df = pd.read_csv(f"{base_path}/{num_parts}/{method}/token_counts_{method}.csv")
+    elif "Lines" in perspective_radio:
+        df = pd.read_csv(f"{base_path}/{num_parts}/{method}/line_counts_{method}.csv")
+    elif "Complexity" in perspective_radio:
+        df = pd.read_csv(f"{base_path}/{num_parts}/{method}/CC_{method}.csv")
+    elif "Problem Types" in perspective_radio:
+        df = pd.read_csv(f"{base_path}/cata_result.csv")
+
+    # Load the analysis report
+    analysis_result, _ = load_analysis_report(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio)
+    # AI analysis column
+    df["Analysis"] = df["Model"].map(lambda m: analysis_result.get(m, "No analysis provided."))
+    return df
+
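# Note: load_analysis_report (below) returns an error *string* rather than a dict when a
# report file cannot be read, in which case the .map(...) above would raise. A minimal
# guard, sketched with the same names:
#
#     if isinstance(analysis_result, dict):
#         df["Analysis"] = df["Model"].map(lambda m: analysis_result.get(m, "No analysis provided."))
#     else:
#         df["Analysis"] = str(analysis_result)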
+# Generate CSS styles
 def generate_css(line_counts, token_counts, cyclomatic_complexity, problem_type, show_high, show_medium, show_low):
     css = """
     #dataframe th {
        background-color: #f2f2f2
@@ -139,225 +73,423 @@
     return css
+# AI analysis
+def load_analysis_report(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
+    num_parts = num_parts_dropdown
+    method = "QS" if division_method_radio == "Equal Frequency Partitioning" else "EI"
+
+    # Determine the file path from the perspective
+    if "Tokens" in perspective_radio:
+        perspective = "token_counts"
+    elif "Lines" in perspective_radio:
+        perspective = "line_counts"
+    elif "Complexity" in perspective_radio:
+        perspective = "CC"
+    else:
+        perspective = "problem_type"
+    base_path = "./llm_insight"
+    if perspective == "problem_type":
+        report_file = f"{base_path}/{dataset_radio}/{perspective}_report.json"
+        recommendation_file = f"{base_path}/{dataset_radio}/{perspective}_recommendation.json"
+    else:
+        report_file = f"{base_path}/{dataset_radio}/{num_parts}/{method}/{perspective}_report.json"
+        recommendation_file = f"{base_path}/{dataset_radio}/{num_parts}/{method}/{perspective}_recommendation.json"
-def update_radio_options(token_counts, line_counts, cyclomatic_complexity, problem_type):
-    options = []
-    if token_counts:
-        options.append("The Number of Tokens in Problem Descriptions")
-    if line_counts:
-        options.append("The Number of Lines in Problem Descriptions")
-    if cyclomatic_complexity:
-        options.append("The Complexity of Reference Code")
-    if problem_type:
-        options.append("Problem Type")
+    try:
+        with open(report_file, 'r', encoding='utf-8') as f:
+            analysis_result = json.load(f)
+    except Exception as e:
+        analysis_result = f"[Error] failed to load analysis report: {e}"
-    return gr.update(choices=options)
+    try:
+        with open(recommendation_file, 'r', encoding='utf-8') as f:
+            recommendation_result = json.load(f)
+    except Exception as e:
+        recommendation_result = f"[Error] failed to load model recommendation: {e}"
+
+    return (analysis_result, recommendation_result)
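# For reference, the report files consumed here (see the llm_insight/ JSON files added
# below) map each model name to an analysis string plus a "global_insights" entry, e.g.:
#
#     {
#         "Nxcode-CQ-7B": "Consistently high performance across all subsets ...",
#         "global_insights": "Models generally perform best on shorter token sequences ..."
#     }
#
# and the recommendation files map a scenario name to a list of {model: reason} objects.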
+
+# Visualization
+def plot_visualization(dataset_radio, perspective_radio, num_parts, plot_type):
+    if dataset_radio == "HumanEval":
+        base_path = "./dividing_into_different_subsets"
+    else:  # MBPP
+        base_path = "./dividing_into_different_subsets_mbpp"
+
+    if "Tokens" in perspective_radio:
+        file_path = f'{base_path}/{num_parts}/QS/token_counts_QS.csv'
+    elif "Lines" in perspective_radio:
+        file_path = f'{base_path}/{num_parts}/QS/line_counts_QS.csv'
+    elif "Complexity" in perspective_radio:
+        file_path = f'{base_path}/{num_parts}/QS/CC_QS.csv'
+    else:  # Problem Types
+        file_path = f'{base_path}/cata_result.csv'
-def plot_csv(dataset_radio,radio,num):
-    print(dataset_radio,radio)
-    if dataset_radio=="HumanEval":
-
-        if radio=="The Number of Tokens in Problem Descriptions":
-            radio_choice="token_counts"
-            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
-        elif radio=="The Number of Lines in Problem Descriptions":
-            radio_choice="line_counts"
-            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
-        elif radio=="The Complexity of Reference Code":
-            radio_choice="CC"
-            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
-        elif radio=="Problem Type":
-            radio_choice="problem_type"
-            file_path = f'/home/user/app/dividing_into_different_subsets/cata_result.csv'
-            print("test!")
-    elif dataset_radio=="MBPP":
-        if radio=="The Number of Tokens in Problem Descriptions":
-            radio_choice="token_counts"
-            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
-        elif radio=="The Number of Lines in Problem Descriptions":
-            radio_choice="line_counts"
-            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
-        elif radio=="The Complexity of Reference Code":
-            radio_choice="CC"
-            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
-        elif radio=="Problem Type":
-            radio_choice="problem_type"
-            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/cata_result.csv'
-            print("test!")
-
-    # file_path="E:/python-testn/pythonProject3/hh_1/dividing_into_different_subsets/3/QS/CC_QS.csv"
     df = pd.read_csv(file_path)
-    # Use the first column as the index
     df.set_index('Model', inplace=True)
-
-    # Transpose so that models become columns and subsets become rows
     df_transposed = df.T
-    # Draw a line chart with plotly
-    fig = px.line(df_transposed, x=df_transposed.index, y=df_transposed.columns,
-                  title='Model Evaluation Results',
-                  labels={'value': 'Evaluation Score', 'index': 'Evaluation Metric'},
-                  color_discrete_sequence=px.colors.qualitative.Plotly)
-
-    # Configure the hover behaviour
-    fig.update_traces(hovertemplate='%{y}')
+    if plot_type == "Line Chart":
+        fig = px.line(df_transposed,
+                      x=df_transposed.index,
+                      y=df_transposed.columns,
+                      title='Model Performance Across Different Subsets',
+                      labels={'value': 'Evaluation Score', 'index': 'Subsets'},
+                      color_discrete_sequence=px.colors.qualitative.Plotly)
+        fig.update_traces(hovertemplate='%{y}')
+    elif plot_type == "Radar Chart":
+        # Reshape the data into the format the radar chart needs
+        radar_data = []
+        for model in df.index:
+            for subset, score in df.loc[model].items():
+                radar_data.append({
+                    'Model': model,
+                    'Subset': subset,
+                    'Score': score
+                })
+
+        radar_df = pd.DataFrame(radar_data)
+
+        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
+
+        # Build the radar chart
+        fig = px.line_polar(radar_df,
+                            r='Score',
+                            theta='Subset',
+                            color='Model',
+                            line_close=True,
+                            color_discrete_sequence=colors,
+                            title='Model Performance Radar Chart')
+
+        # Customize each model's line style
+        for i, trace in enumerate(fig.data):
+            trace.update(
+                fill=None,  # remove the fill
+                line=dict(
+                    width=2,
+                    dash='solid' if i % 2 == 0 else 'dash',  # alternate solid and dashed lines
+                )
+            )
+
+        # Tidy up the radar layout
+        fig.update_layout(
+            polar=dict(
+                radialaxis=dict(
+                    visible=True,
+                    range=[0, 100],
+                    showline=True,
+                    linewidth=1,
+                    gridcolor='lightgrey'
+                ),
+                angularaxis=dict(
+                    showline=True,
+                    linewidth=1,
+                    gridcolor='lightgrey'
+                )
+            ),
+            showlegend=True,
+            legend=dict(
+                yanchor="middle",  # vertically centered
+                y=0.5,
+                xanchor="left",
+                x=1.2,  # move the legend to the right of the chart
+                bgcolor="rgba(255, 255, 255, 0.8)",  # translucent white background
+                bordercolor="lightgrey",  # add a border
+                borderwidth=1
+            ),
+            margin=dict(r=150),  # extra right margin to make room for the legend
+            paper_bgcolor='white'
+        )
+    else:  # Heatmap
+        # Build the heatmap
+        fig = px.imshow(df_transposed,
+                        labels=dict(x="Model", y="Subset", color="Score"),
+                        color_continuous_scale="RdYlBu_r",  # red-yellow-blue, a common scientific palette
+                        aspect="auto",  # automatic aspect ratio
+                        title="Model Performance Heatmap")
+
+        # Tidy up the heatmap layout
+        fig.update_layout(
+            title=dict(
+                text='Model Performance Distribution Across Subsets',
+                x=0.5,
+                y=0.95,
+                xanchor='center',
+                yanchor='top',
+                font=dict(size=14)
+            ),
+            xaxis=dict(
+                title="Model",
+                tickangle=45,  # tilt the model names
+                tickfont=dict(size=10),
+                side="bottom"
+            ),
+            yaxis=dict(
+                title="Subset",
+                tickfont=dict(size=10)
+            ),
+            coloraxis=dict(
+                colorbar=dict(
+                    title="Score",
+                    titleside="right",
+                    tickfont=dict(size=10),
+                    titlefont=dict(size=12),
+                    len=0.9,  # colorbar length
+                )
+            ),
+            margin=dict(t=80, r=100, b=80, l=80),  # adjust margins
+            paper_bgcolor='white',
+            plot_bgcolor='white'
+        )
+
+        # Annotate each cell with its value
+        annotations = []
+        for i in range(len(df_transposed.index)):
+            for j in range(len(df_transposed.columns)):
+                annotations.append(
+                    dict(
+                        x=j,
+                        y=i,
+                        text=f"{df_transposed.iloc[i, j]:.1f}",
+                        showarrow=False,
+                        font=dict(size=9, color='black')
+                    )
+                )
+        fig.update_layout(annotations=annotations)
 
     return fig
+# Sunburst chart
+def plot_recommendation_sunburst(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
+    import plotly.graph_objects as go
+    _, recommendation_result = load_analysis_report(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio)
+    labels = ['Model Recommendation']  # root node
+    parents = ['']
+    values = []
+    customdata = ['Choose your preferred model']
+
+    # Count the models under each scenario
+    scenario_model_count = {}
+    total_model_count = 0
+
+    for scenario, model_list in recommendation_result.items():
+        # Collect the model entries
+        model_items = []
+        if isinstance(model_list, dict):
+            model_items = model_list.items()
+        elif isinstance(model_list, list):
+            for d in model_list:
+                if isinstance(d, dict):
+                    for k, v in d.items():
+                        model_items.append((k, v))
+
+        scenario_model_count[scenario] = len(model_items)
+        total_model_count += len(model_items)
+
+    # Value for the root node
+    values.append(total_model_count)
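# With branchvalues="total" on the go.Sunburst below, every parent's value must equal the
# sum of its children's values, which is why the root takes the total model count and each
# scenario takes its own model count. For example, two scenarios recommending 2 and 1
# models yield a value of 3 for the root, 2 and 1 for the two scenarios, and 1 for each of
# the three model leaves.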
+
+    # Second pass: fill labels/parents/values/customdata
+    for scenario, model_list in recommendation_result.items():
+        scenario_words = scenario.split()
+        short_label = " ".join(scenario_words[:3]) + "..." if len(scenario_words) > 3 else scenario
+        labels.append(short_label)
+        parents.append('Model Recommendation')
+        values.append(scenario_model_count[scenario])
+        customdata.append(scenario)
+
+        # Collect the model entries
+        model_items = []
+        if isinstance(model_list, dict):
+            model_items = model_list.items()
+        elif isinstance(model_list, list):
+            for d in model_list:
+                if isinstance(d, dict):
+                    for k, v in d.items():
+                        model_items.append((k, v))
+
+        for model, reason in model_items:
+            labels.append(model)
+            parents.append(short_label)
+            values.append(1)
+            customdata.append(reason)
+
+    fig = go.Figure(go.Sunburst(
+        labels=labels,
+        parents=parents,
+        values=values,
+        branchvalues="total",
+        hovertemplate='%{customdata}',
+        customdata=customdata
+    ))
+    fig.update_layout(margin=dict(t=10, l=10, r=10, b=10), height=500)
+    return fig
-def toggle_radio(checkbox, radio):
-    return gr.update(visible=checkbox)
+### Gradio UI section ###
-def toggle_line_counts_visibility(dataset):
-    if dataset == "MBPP":
-        return gr.update(visible=False)
-    else:
-        return gr.update(visible=True)
+# Custom CSS styles
+custom_css = """
+
+"""
-    # Create the Gradio interface
-import gradio as gr
+# Build the interface
-with gr.Blocks() as iface:
+with gr.Blocks(css=custom_css) as iface:
     gr.HTML("""
-
-    """)
-
-    with gr.Tabs() as tabs:
-        with gr.TabItem("Evaluation Result"):
-            with gr.Row():
-                with gr.Column(scale=2):
-                    with gr.Row():
-                        with gr.Column():
-                            dataset_radio = gr.Radio(["HumanEval", "MBPP"], label="Select Dataset ")
-
-
-                    with gr.Row():
-                        custom_css = """
-
-                        """
-
-                        with gr.Column():
-                            gr.Markdown(
-                                f"{custom_css}Choose Division Perspective")
-
-                            token_counts_checkbox = gr.Checkbox(label="I-The Number of Tokens in Problem Descriptions")
-                            line_counts_checkbox = gr.Checkbox(label="II-The Number of Lines in Problem Descriptions")
-                            dataset_radio.change(fn=toggle_line_counts_visibility, inputs=dataset_radio,
-                                                 outputs=line_counts_checkbox)
-                            cyclomatic_complexity_checkbox = gr.Checkbox(label="III-The Complexity of Reference Code")
-                            problem_type_checkbox = gr.Checkbox(label="IV-Problem Types ")
-                            css_code = """
-                            .dropdown-container {
-                                display: none;
-                            }
-                            """
-
-                        with gr.Column():
-                            # gr.Markdown("Choose Subsets")
-                            num_parts_dropdown = gr.Dropdown(choices=[0, 3, 4, 5, 6, 7, 8], label="Choose the Number of Subsets", value="")
-
-            with gr.Row():
-                with gr.Column():
-                    token_counts_radio = gr.Radio(
-                        ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
-                        label="Choose the Division Method for Perspective-I",
-                        visible=False)
-                with gr.Column():
-                    line_counts_radio = gr.Radio(
-                        ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
-                        label="Choose the Division Method for Perspective-II",
-                        visible=False)
-                with gr.Column():
-                    cyclomatic_complexity_radio = gr.Radio(
-                        ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
-                        label="Choose the Division Method for Perspective-III",
-                        visible=False)
-
-            token_counts_checkbox.change(fn=lambda x: toggle_radio(x, token_counts_radio),
-                                         inputs=token_counts_checkbox, outputs=token_counts_radio)
-            line_counts_checkbox.change(fn=lambda x: toggle_radio(x, line_counts_radio),
-                                        inputs=line_counts_checkbox, outputs=line_counts_radio)
-            cyclomatic_complexity_checkbox.change(fn=lambda x: toggle_radio(x, cyclomatic_complexity_radio),
-                                                  inputs=cyclomatic_complexity_checkbox,
-                                                  outputs=cyclomatic_complexity_radio)
-
-            with gr.Tabs() as inner_tabs:
+        Multi-view Code LLM Leaderboard
+        Multi-view Leaderboard: Evaluating Large Language Models From Multiple Views
+    """)
+
+    with gr.Row():
+        # Configuration panel
+        with gr.Column(scale=1):
+            dataset_radio = gr.Radio(
+                ["HumanEval", "MBPP"],
+                label="Select a dataset",
+                value="HumanEval"
+            )
+            num_parts_slider = gr.Slider(
+                minimum=3,
+                maximum=8,
+                step=1,
+                label="Choose the Number of Subsets",
+                value=3
+            )
+
+            # The multiple checkboxes are replaced by a single radio
+            perspective_radio = gr.Radio(
+                ["I - Num of Tokens in Problem Desc",
+                 "II - Num of Lines in Problem Desc",
+                 "III - Complexity of Reference Code",
+                 "IV - Problem Types"],
+                label="Choose Perspective",
+                value="I - Num of Tokens in Problem Desc"
+            )
+
+            # A single, shared division-method radio
+            division_method_radio = gr.Radio(
+                ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
+                label="Choose the Division Method",
+                visible=True
+            )
+
+            confirm_btn = gr.Button("Confirm", variant="primary")
+
+        # Main display area
+        with gr.Column(scale=2):
+            with gr.Tabs():
+                # Table
                 with gr.TabItem("Ranking Table"):
-                    dataframe_output = gr.Dataframe(elem_id="dataframe")
-                    css_output = gr.HTML()
-                    confirm_button = gr.Button("Confirm ")
-                    confirm_button.click(fn=on_confirm, inputs=[dataset_radio, num_parts_dropdown, token_counts_radio,
-                                                                line_counts_radio, cyclomatic_complexity_radio,
-                                                                problem_type_checkbox],
-                                         outputs=dataframe_output)
-
-                with gr.TabItem("Line chart"):
-                    select_radio = gr.Radio(choices=[], label="Select One Perspective")
-                    checkboxes = [token_counts_checkbox, line_counts_checkbox, cyclomatic_complexity_checkbox,
-                                  problem_type_checkbox]
-                    for checkbox in checkboxes:
-                        checkbox.change(fn=update_radio_options, inputs=checkboxes, outputs=select_radio)
-                    select_radio.change(fn=plot_csv, inputs=[dataset_radio, select_radio, num_parts_dropdown],
-                                        outputs=gr.Plot(label="Line Plot "))
-
-                # with gr.TabItem("Upload Inference File"):
-                #     gr.Markdown("Upload a JSON file")
-                #     with gr.Row():
-                #         with gr.Column():
-                #             string_input = gr.Textbox(label="Enter the Model Name")
-                #             number_input = gr.Number(label="Select the Number of Samples")
-                #             dataset_choice = gr.Dropdown(label="Select Dataset", choices=["HumanEval", "MBPP"])
-                #         with gr.Column():
-                #             file_input = gr.File(label="Upload Generation Result in JSON file")
-                #             upload_button = gr.Button("Confirm and Upload")
-
-                #     json_output = gr.JSON(label="")
-
-                #     upload_button.click(fn=generate_file, inputs=[file_input, string_input, number_input, dataset_choice],
-                #                         outputs=json_output)
-
-    css = """
-    #scale1 {
-        border: 1px solid rgba(0, 0, 0, 0.2);
-        padding: 10px;
-        border-radius: 8px;
-        background-color: #f9f9f9;
-        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
-}
-    }
-    """
-    gr.HTML(f"")
-
-    # Initialize the data table
-    # initial_df = show_data(False, False, False, False, False, False, False)
-    # initial_css = generate_css(False, False, False, False, True, False, False)
-    # dataframe_output.value = initial_df
-    # css_output.value = f""
+                    data_table = gr.Dataframe(headers=["Model", "Score", "Analysis"], interactive=True)
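# The headers above are placeholders: once on_confirm returns, the table shows the
# returned frame's actual columns (Model, one score column per subset, and the appended
# Analysis column).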

+                # Visualization
+                with gr.TabItem("Visualization"):
+                    plot_type = gr.Radio(
+                        choices=["Line Chart", "Radar Chart", "Heatmap"],
+                        label="Select Plot Type",
+                        value="Line Chart"
+                    )
+                    chart = gr.Plot()
+                # AI analysis
+                with gr.TabItem("Model selection suggestions"):
+                    with gr.Column():
+                        gr.Markdown("🎯 Model Recommendation")
+                        recommendation_plot = gr.Plot()
+                        scenario_legend = gr.Markdown(value="")  # newly added legend
+
+    def update_perspective_options(dataset):
+        if dataset == "MBPP":
+            return gr.update(choices=[
+                "I - Num of Tokens in Problem Desc",
+                "III - Complexity of Reference Code",
+                "IV - Problem Types"
+            ])
+        else:
+            return gr.update(choices=[
+                "I - Num of Tokens in Problem Desc",
+                "II - Num of Lines in Problem Desc",
+                "III - Complexity of Reference Code",
+                "IV - Problem Types"
+            ])
+
+    dataset_radio.change(
+        fn=update_perspective_options,
+        inputs=dataset_radio,
+        outputs=perspective_radio
+    )
+
+
+    # Wire up the events
+    confirm_btn.click(
+        fn=on_confirm,
+        inputs=[dataset_radio, num_parts_slider, perspective_radio, division_method_radio],
+        outputs=data_table
+    ).then(
+        fn=load_analysis_report,
+        inputs=[dataset_radio, num_parts_slider, perspective_radio, division_method_radio],
+        outputs=[gr.State()]
+    ).then(
+        fn=plot_visualization,
+        inputs=[dataset_radio, perspective_radio, num_parts_slider, plot_type],
+        outputs=chart
+    ).then(
+        fn=plot_recommendation_sunburst,
+        inputs=[dataset_radio, num_parts_slider, perspective_radio, division_method_radio],
+        outputs=[recommendation_plot]  # note: this is a list
+    )
+
+    plot_type.change(
+        fn=plot_visualization,
+        inputs=[dataset_radio, perspective_radio, num_parts_slider, plot_type],
+        outputs=chart
+    )
 # Launch the interface
 iface.launch()
\ No newline at end of file
diff --git a/app_back.py b/app_back.py
new file mode 100644
index 0000000000000000000000000000000000000000..b25f2760cc4f061555c6afa39ae268beaffeb40e
--- /dev/null
+++ b/app_back.py
@@ -0,0 +1,363 @@
+import gradio as gr
+import pandas as pd
+import requests
+import os
+import shutil
+import json
+import pandas as pd
+import subprocess
+import plotly.express as px
+def on_confirm(dataset_radio, num_parts_dropdown, token_counts_radio, line_counts_radio, cyclomatic_complexity_radio, problem_type_checkbox):
+    # Build the file paths from the user's selections
+    num_parts = num_parts_dropdown
+    # token_counts_split = token_counts_radio
+    # line_counts_split = line_counts_radio
+    # cyclomatic_complexity_split = cyclomatic_complexity_radio
+
+
+    # Load the data
+    dataframes = []
+    if dataset_radio == "HumanEval":
+        if token_counts_radio=="Equal Frequency Partitioning":  # equal-frequency split: each subset holds roughly the same number of data points
+            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/token_counts_QS.csv")
+            dataframes.append(token_counts_df)
+        if token_counts_radio=="Equal Interval Partitioning":
+            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/token_counts_EI.csv")
+            dataframes.append(token_counts_df)
+        if line_counts_radio=="Equal Frequency Partitioning":  # equal-frequency split
+            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/line_counts_QS.csv")
+            dataframes.append(line_counts_df)
+        if token_counts_radio=="Equal Interval Partitioning":
+            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/line_counts_EI.csv")
+            dataframes.append(line_counts_df)
+        if cyclomatic_complexity_radio=="Equal Frequency Partitioning":  # equal-frequency split
+            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/CC_QS.csv")
+            dataframes.append(CC_df)
+        if token_counts_radio=="Equal Interval Partitioning":
+            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/CC_EI.csv")
+            dataframes.append(CC_df)
+
+
+
+        # The following now reads directly from a single partition file
+        if problem_type_checkbox:
+            problem_type_df = pd.read_csv("/home/user/app/dividing_into_different_subsets/cata_result.csv")
+            dataframes.append(problem_type_df)
+    if dataset_radio == "MBPP":
+        if token_counts_radio=="Equal Frequency Partitioning":  # equal-frequency split
+            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/token_counts_QS.csv")
+            dataframes.append(token_counts_df)
+        if token_counts_radio=="Equal Interval Partitioning":
+            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/token_counts_EI.csv")
+            dataframes.append(token_counts_df)
+        if line_counts_radio=="Equal Frequency Partitioning":  # equal-frequency split
+            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/line_counts_QS.csv")
+            dataframes.append(line_counts_df)
+        if token_counts_radio=="Equal Interval Partitioning":
+            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/line_counts_EI.csv")
+            dataframes.append(line_counts_df)
+        if cyclomatic_complexity_radio=="Equal Frequency Partitioning":  # equal-frequency split
+            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/CC_QS.csv")
+            dataframes.append(CC_df)
+        if token_counts_radio=="Equal Interval Partitioning":
+            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/CC_EI.csv")
+            dataframes.append(CC_df)
+
+
+
+        # The following now reads directly from a single partition file
+        if problem_type_checkbox:
+            problem_type_df = pd.read_csv("/home/user/app/dividing_into_different_subsets_mbpp/cata_result.csv")
+            dataframes.append(problem_type_df)
+
+    # If all three radios have a value, join the rows of the three files
+    if len(dataframes) > 0:
+        combined_df = dataframes[0]
+        for df in dataframes[1:]:
+            combined_df = pd.merge(combined_df, df, left_index=True, right_index=True, suffixes=('', '_y'))
+            combined_df = combined_df.loc[:, ~combined_df.columns.str.endswith('_y')]  # drop duplicated columns
+        return combined_df
+    else:
+        return pd.DataFrame()
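# The merge above joins purely on the row index, which assumes every partition CSV lists
# the models in the same order. A key-based variant (a sketch, assuming each CSV shares a
# "Model" column) would be:
#
#     combined_df = pd.merge(combined_df, df, on="Model", suffixes=('', '_y'))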
+
+
+
+
+def execute_specified_python_files(directory_list, file_list):
+    for directory in directory_list:
+        for py_file in file_list:
+            file_path = os.path.join(directory, py_file)
+            if os.path.isfile(file_path) and py_file.endswith('.py'):
+                print(f"Executing {file_path}...")
+                try:
+                    # Run the Python file via subprocess
+                    subprocess.run(['python', file_path], check=True)
+                    print(f"{file_path} executed successfully.")
+                except subprocess.CalledProcessError as e:
+                    print(f"Error executing {file_path}: {e}")
+            else:
+                print(f"File {file_path} does not exist or is not a Python file.")
+# Define a function to generate the CSS styles
+def generate_css(line_counts, token_counts, cyclomatic_complexity, problem_type, show_high, show_medium, show_low):
+    css = """
+    #dataframe th {
+    background-color: #f2f2f2
+
+    }
+    """
+    colors = ["#e6f7ff", "#ffeecc", "#e6ffe6", "#ffe6e6"]
+    categories = [line_counts, token_counts, cyclomatic_complexity]
+    category_index = 0
+    column_index = 1
+
+    for category in categories:
+        if category:
+            if show_high:
+                css += f"#dataframe td:nth-child({column_index + 1}) {{ background-color: {colors[category_index]}; }}\n"
+                column_index += 1
+            if show_medium:
+                css += f"#dataframe td:nth-child({column_index + 1}) {{ background-color: {colors[category_index]}; }}\n"
+                column_index += 1
+            if show_low:
+                css += f"#dataframe td:nth-child({column_index + 1}) {{ background-color: {colors[category_index]}; }}\n"
+                column_index += 1
+            category_index += 1
+
+    # Use a fixed color for the three Problem Type sub-columns
+    if problem_type:
+        problem_type_color = "#d4f0fc"  # pick any color you like
+        css += f"#dataframe td:nth-child({column_index + 1}) {{ background-color: {problem_type_color}; }}\n"
+        css += f"#dataframe td:nth-child({column_index + 2}) {{ background-color: {problem_type_color}; }}\n"
+        css += f"#dataframe td:nth-child({column_index + 3}) {{ background-color: {problem_type_color}; }}\n"
+
+    # Hide the "data" label
+    css += """
+    .gradio-container .dataframe-container::before {
+        content: none !important;
+    }
+    """
+
+    return css
+
+
+
+
+def update_radio_options(token_counts, line_counts, cyclomatic_complexity, problem_type):
+    options = []
+    if token_counts:
+        options.append("The Number of Tokens in Problem Descriptions")
+    if line_counts:
+        options.append("The Number of Lines in Problem Descriptions")
+    if cyclomatic_complexity:
+        options.append("The Complexity of Reference Code")
+    if problem_type:
+        options.append("Problem Type")
+
+    return gr.update(choices=options)
+
+def plot_csv(dataset_radio,radio,num):
+    print(dataset_radio,radio)
+    if dataset_radio=="HumanEval":
+
+        if radio=="The Number of Tokens in Problem Descriptions":
+            radio_choice="token_counts"
+            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
+        elif radio=="The Number of Lines in Problem Descriptions":
+            radio_choice="line_counts"
+            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
+        elif radio=="The Complexity of Reference Code":
+            radio_choice="CC"
+            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
+        elif radio=="Problem Type":
+            radio_choice="problem_type"
+            file_path = f'/home/user/app/dividing_into_different_subsets/cata_result.csv'
+            print("test!")
+    elif dataset_radio=="MBPP":
+        if radio=="The Number of Tokens in Problem Descriptions":
+            radio_choice="token_counts"
+            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
+        elif radio=="The Number of Lines in Problem Descriptions":
+            radio_choice="line_counts"
+            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
+        elif radio=="The Complexity of Reference Code":
+            radio_choice="CC"
+            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
+        elif radio=="Problem Type":
+            radio_choice="problem_type"
+            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/cata_result.csv'
+            print("test!")
+
+    # file_path="E:/python-testn/pythonProject3/hh_1/dividing_into_different_subsets/3/QS/CC_QS.csv"
+    df = pd.read_csv(file_path)
+    # Use the first column as the index
+    df.set_index('Model', inplace=True)
+
+    # Transpose so that models become columns and subsets become rows
+    df_transposed = df.T
+
+    # Draw a line chart with plotly
+    fig = px.line(df_transposed, x=df_transposed.index, y=df_transposed.columns,
+                  title='Model Evaluation Results',
+                  labels={'value': 'Evaluation Score', 'index': 'Evaluation Metric'},
+                  color_discrete_sequence=px.colors.qualitative.Plotly)
+
+    # Configure the hover behaviour
+    fig.update_traces(hovertemplate='%{y}')
+
+    return fig
+
+def toggle_radio(checkbox, radio):
+    return gr.update(visible=checkbox)
+
+def toggle_line_counts_visibility(dataset):
+    if dataset == "MBPP":
+        return gr.update(visible=False)
+    else:
+        return gr.update(visible=True)
+
+    # Create the Gradio interface
+import gradio as gr
+
+with gr.Blocks() as iface:
+    gr.HTML("""
+
+    """)
+
+    with gr.Tabs() as tabs:
+        with gr.TabItem("Evaluation Result"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    with gr.Row():
+                        with gr.Column():
+                            dataset_radio = gr.Radio(["HumanEval", "MBPP"], label="Select Dataset ")
+
+
+                    with gr.Row():
+                        custom_css = """
+
+                        """
+
+                        with gr.Column():
+                            gr.Markdown(
+                                f"{custom_css}Choose Division Perspective")
+
+                            token_counts_checkbox = gr.Checkbox(label="I-The Number of Tokens in Problem Descriptions")
+                            line_counts_checkbox = gr.Checkbox(label="II-The Number of Lines in Problem Descriptions")
+                            dataset_radio.change(fn=toggle_line_counts_visibility, inputs=dataset_radio,
+                                                 outputs=line_counts_checkbox)
+                            cyclomatic_complexity_checkbox = gr.Checkbox(label="III-The Complexity of Reference Code")
+                            problem_type_checkbox = gr.Checkbox(label="IV-Problem Types ")
+                            css_code = """
+                            .dropdown-container {
+                                display: none;
+                            }
+                            """
+
+                        with gr.Column():
+                            # gr.Markdown("Choose Subsets")
+                            num_parts_dropdown = gr.Dropdown(choices=[0, 3, 4, 5, 6, 7, 8], label="Choose the Number of Subsets", value="")
+
+            with gr.Row():
+                with gr.Column():
+                    token_counts_radio = gr.Radio(
+                        ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
+                        label="Choose the Division Method for Perspective-I",
+                        visible=False)
+                with gr.Column():
+                    line_counts_radio = gr.Radio(
+                        ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
+                        label="Choose the Division Method for Perspective-II",
+                        visible=False)
+                with gr.Column():
+                    cyclomatic_complexity_radio = gr.Radio(
+                        ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
+                        label="Choose the Division Method for Perspective-III",
+                        visible=False)
+
+            token_counts_checkbox.change(fn=lambda x: toggle_radio(x, token_counts_radio),
+                                         inputs=token_counts_checkbox, outputs=token_counts_radio)
+            line_counts_checkbox.change(fn=lambda x: toggle_radio(x, line_counts_radio),
+                                        inputs=line_counts_checkbox, outputs=line_counts_radio)
+            cyclomatic_complexity_checkbox.change(fn=lambda x: toggle_radio(x, cyclomatic_complexity_radio),
+                                                  inputs=cyclomatic_complexity_checkbox,
+                                                  outputs=cyclomatic_complexity_radio)
+
+            with gr.Tabs() as inner_tabs:
+                with gr.TabItem("Ranking Table"):
+                    dataframe_output = gr.Dataframe(elem_id="dataframe")
+                    css_output = gr.HTML()
+                    confirm_button = gr.Button("Confirm ")
+                    confirm_button.click(fn=on_confirm, inputs=[dataset_radio, num_parts_dropdown, token_counts_radio,
+                                                                line_counts_radio, cyclomatic_complexity_radio,
+                                                                problem_type_checkbox],
+                                         outputs=dataframe_output)
+
+                with gr.TabItem("Line chart"):
+                    select_radio = gr.Radio(choices=[], label="Select One Perspective")
+                    checkboxes = [token_counts_checkbox, line_counts_checkbox, cyclomatic_complexity_checkbox,
+                                  problem_type_checkbox]
+                    for checkbox in checkboxes:
+                        checkbox.change(fn=update_radio_options, inputs=checkboxes, outputs=select_radio)
+                    select_radio.change(fn=plot_csv, inputs=[dataset_radio, select_radio, num_parts_dropdown],
+                                        outputs=gr.Plot(label="Line Plot "))
+
+                # with gr.TabItem("Upload Inference File"):
+                #     gr.Markdown("Upload a JSON file")
+                #     with gr.Row():
+                #         with gr.Column():
+                #             string_input = gr.Textbox(label="Enter the Model Name")
+                #             number_input = gr.Number(label="Select the Number of Samples")
+                #             dataset_choice = gr.Dropdown(label="Select Dataset", choices=["HumanEval", "MBPP"])
+                #         with gr.Column():
+                #             file_input = gr.File(label="Upload Generation Result in JSON file")
+                #             upload_button = gr.Button("Confirm and Upload")
+
+                #     json_output = gr.JSON(label="")
+
+                #     upload_button.click(fn=generate_file, inputs=[file_input, string_input, number_input, dataset_choice],
+                #                         outputs=json_output)
+
+    css = """
+    #scale1 {
+        border: 1px solid rgba(0, 0, 0, 0.2);
+        padding: 10px;
+        border-radius: 8px;
+        background-color: #f9f9f9;
+        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+}
+    }
+    """
+    gr.HTML(f"")
+
+    # Initialize the data table
+    # initial_df = show_data(False, False, False, False, False, False, False)
+    # initial_css = generate_css(False, False, False, False, True, False, False)
+    # dataframe_output.value = initial_df
+    # css_output.value = f""
+
+# Launch the interface
+iface.launch()
\ No newline at end of file
diff --git a/llm_insight/HumanEval/3/EI/CC_recommendation.json b/llm_insight/HumanEval/3/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..095aa1f6955dd1a54c8685192212aea7e3a5a60c
--- /dev/null
+++ b/llm_insight/HumanEval/3/EI/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance and robustness": 
[ + { + "Nxcode-CQ-7B": "Consistently high scores across all subsets, making it the most reliable choice." + }, + { + "CodeFuse-DeepSeek-33b": "Strong performance in two subsets, though with a drop in one, indicating good but not perfect robustness." + } + ], + "Moderate performance with cost-effectiveness": [ + { + "deepseek_coder_33b-instruct": "Decent performance across all subsets, offering a balance between cost and effectiveness." + }, + { + "deepseek_coder-6.7b-instruct": "Good performance in two subsets, suitable for scenarios where some variability is acceptable." + } + ], + "Not recommended": [ + { + "codegemma-2b": "Extremely low performance across all subsets." + }, + { + "deepseek-coder-1.3b-base": "Poor performance, especially in one subset with 0.0 score." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/EI/CC_report.json b/llm_insight/HumanEval/3/EI/CC_report.json new file mode 100644 index 0000000000000000000000000000000000000000..b5d8397000b62e726de520c4694081f976da5719 --- /dev/null +++ b/llm_insight/HumanEval/3/EI/CC_report.json @@ -0,0 +1,15 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in CC_subset_1 (79.86) and CC_subset_3 (75.0), but a significant drop in CC_subset_2 (50.0). This suggests it may struggle with certain types of data in the CC perspective.", + "Nxcode-CQ-7B": "Consistently high performance across all subsets (88.65, 75.0, 87.5), indicating robustness in handling diverse data splits under the CC perspective.", + "codegemma-2b": "Poor performance across all subsets (30.56, 2.5, 2.5), suggesting it is not suitable for tasks under the CC perspective.", + "codegemma-7b": "Moderate performance in CC_subset_1 (43.4) but very low in CC_subset_2 (12.5) and CC_subset_3 (18.75), indicating limited applicability.", + "codegemma-7b-it": "Better than codegemma-7b but still inconsistent (56.32, 32.5, 21.25), with performance dropping in later subsets.", + "deepseek-coder-1.3b-base": "Very low performance, especially in CC_subset_3 (0.0), making it unsuitable for this perspective.", + "deepseek-coder-6.7b-base": "Moderate in CC_subset_1 (50.31) but poor in others (14.06, 5.0), indicating limited use.", + "deepseek_coder-6.7b-instruct": "Strong in CC_subset_1 (74.62) and decent in CC_subset_2 (53.12), but drops in CC_subset_3 (36.25).", + "deepseek_coder_33b-base": "Moderate performance across subsets (55.9, 31.87, 22.5), with a downward trend.", + "deepseek_coder_33b-instruct": "Consistently decent performance (68.65, 49.69, 41.25), though not as high as Nxcode-CQ-7B.", + "codeqwen1.5-7b": "Inconsistent performance (55.0, 17.19, 57.5), with a notable drop in CC_subset_2.", + "new": "Similar to codegemma-7b-it (56.32, 32.5, 21.25), indicating no significant improvement.", + "global_insights": "Nxcode-CQ-7B is the top performer across all subsets, showing robustness. CodeFuse-DeepSeek-33b and deepseek_coder_33b-instruct also perform well but with some variability. Smaller models like codegemma-2b and deepseek-coder-1.3b-base are not recommended for this perspective." 
+} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/EI/line_counts_recommendation.json b/llm_insight/HumanEval/3/EI/line_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..e469f78d9afdb5de66433bf9ed1f121a3ab79135 --- /dev/null +++ b/llm_insight/HumanEval/3/EI/line_counts_recommendation.json @@ -0,0 +1,23 @@ +{ + "Small to medium line counts with high accuracy": [ + { + "Nxcode-CQ-7B": "Excels in line_subset_1 (88.82) and line_subset_2 (87.27), making it ideal for small to medium line counts." + }, + { + "deepseek_coder-6.7b-instruct": "Strong performance in line_subset_1 (75.65) and line_subset_2 (67.42), suitable for similar scenarios." + } + ], + "Large line counts with acceptable performance": [ + { + "CodeFuse-DeepSeek-33b": "Best performance in line_subset_3 (87.5), making it the top choice for large line counts." + } + ], + "Cost-effective for small line counts": [ + { + "codegemma-7b-it": "Moderate performance in line_subset_1 (57.32) at a potentially lower cost than larger models." + }, + { + "deepseek_coder_33b-instruct": "Balanced performance in line_subset_1 (70.85) and line_subset_2 (60.0) for its size." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/EI/line_counts_report.json b/llm_insight/HumanEval/3/EI/line_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..116facce050541e659d8b9e8bf62ee7cd19b0102 --- /dev/null +++ b/llm_insight/HumanEval/3/EI/line_counts_report.json @@ -0,0 +1,15 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in line_subset_3 (87.5) but drops significantly in line_subset_2 (72.73). This suggests it handles larger line counts better but may struggle with medium-sized inputs.", + "Nxcode-CQ-7B": "This model excels in line_subset_1 (88.82) and line_subset_2 (87.27) but performs poorly in line_subset_3 (63.75), indicating it's optimized for small to medium line counts but not for larger ones.", + "codegemma-2b": "Consistently poor performance across all subsets, with the worst in line_subset_3 (1.88). Not suitable for any line count scenario.", + "codegemma-7b": "Better than codegemma-2b but still underperforms, especially in line_subset_3 (11.88). Limited utility across line counts.", + "codegemma-7b-it": "Shows improvement over other codegemma models, particularly in line_subset_1 (57.32), but still struggles with larger line counts (26.88 in line_subset_3).", + "deepseek-coder-1.3b-base": "Low performance across all subsets, with line_subset_1 being the best (35.69). Not recommended for any scenario.", + "deepseek-coder-6.7b-base": "Moderate performance in line_subset_1 (49.47) and line_subset_2 (40.61), but very poor in line_subset_3 (8.13). Avoid for large line counts.", + "deepseek_coder-6.7b-instruct": "Strong in line_subset_1 (75.65) and line_subset_2 (67.42), but drops in line_subset_3 (26.25). Suitable for small to medium line counts.", + "deepseek_coder_33b-base": "Decent in line_subset_1 (57.4) and line_subset_2 (44.55), but poor in line_subset_3 (15.0). Limited to small-medium line counts.", + "deepseek_coder_33b-instruct": "Good performance in line_subset_1 (70.85) and line_subset_2 (60.0), but struggles with line_subset_3 (18.75). Best for small-medium line counts.", + "codeqwen1.5-7b": "Moderate across all subsets, with line_subset_1 (53.7) being the best. Not outstanding in any scenario.", + "new": "Similar to codegemma-7b-it, with identical scores. 
Shows potential for small-medium line counts but not for large ones.", + "global_insights": "Models generally perform best in line_subset_1 and worst in line_subset_3, indicating a trend of decreasing performance with increasing line counts. Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are exceptions, excelling in different subsets. Smaller models (e.g., codegemma-2b) are consistently poor, while larger models (e.g., deepseek_coder-6.7b-instruct) show better adaptability." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/EI/token_counts_recommendation.json b/llm_insight/HumanEval/3/EI/token_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..286355ac1fa26fc6418891bf95366d1244acc64f --- /dev/null +++ b/llm_insight/HumanEval/3/EI/token_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High performance with cost-effectiveness": [ + { + "Nxcode-CQ-7B": "Consistently high performance in shorter and medium token subsets, making it suitable for most tasks." + }, + { + "CodeFuse-DeepSeek-33b": "Strong performance in shorter and longer token subsets, ideal for tasks with varying token lengths." + } + ], + "Moderate performance with lower cost": [ + { + "deepseek_coder-6.7b-instruct": "Good performance in shorter and medium token subsets, suitable for budget-conscious users." + }, + { + "codegemma-7b-it": "Better performance than base models, offering a balance between cost and capability." + } + ], + "Not recommended for complex tasks": [ + { + "codegemma-2b": "Poor performance across all subsets, not suitable for any serious tasks." + }, + { + "deepseek-coder-1.3b-base": "Low scores across all subsets, indicating limited utility." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/EI/token_counts_report.json b/llm_insight/HumanEval/3/EI/token_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..1b4d2f950359ebac2950d7cd698b98ae852221fd --- /dev/null +++ b/llm_insight/HumanEval/3/EI/token_counts_report.json @@ -0,0 +1,15 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_1 and token_subset_3 but a significant drop in token_subset_2, indicating potential sensitivity to medium-length tokens.", + "Nxcode-CQ-7B": "Consistently high performance across token_subset_1 and token_subset_2, but a notable decline in token_subset_3, suggesting challenges with longer token sequences.", + "codegemma-2b": "Poor performance across all subsets, with particularly low scores in token_subset_3, indicating it may not be suitable for tasks involving varying token lengths.", + "codegemma-7b": "Moderate performance with a steady decline as token length increases, suggesting limited scalability with longer inputs.", + "codegemma-7b-it": "Better performance than codegemma-7b but still struggles with longer tokens, indicating some improvement but not enough for complex tasks.", + "deepseek-coder-1.3b-base": "Low scores across all subsets, with a slight improvement in token_subset_1, indicating limited capability.", + "deepseek-coder-6.7b-base": "Moderate performance in token_subset_1 but significant drops in longer tokens, suggesting scalability issues.", + "deepseek_coder-6.7b-instruct": "Strong performance in token_subset_1 and token_subset_2 but a sharp decline in token_subset_3, indicating challenges with very long tokens.", + "deepseek_coder_33b-base": "Moderate performance with a steady decline as token length increases, similar to codegemma-7b.", + 
"deepseek_coder_33b-instruct": "Good performance in token_subset_1 and token_subset_3 but a drop in token_subset_2, indicating inconsistency with medium-length tokens.", + "codeqwen1.5-7b": "Moderate performance across all subsets, with a steady decline as token length increases, suggesting limited scalability.", + "new": "Identical performance to codegemma-7b-it, indicating no additional improvements.", + "global_insights": "Models generally perform best on shorter token sequences (token_subset_1) and struggle with longer ones (token_subset_3). Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are the top performers but show inconsistency across subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base are not recommended for tasks involving varying token lengths." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/QS/CC_recommendation.json b/llm_insight/HumanEval/3/QS/CC_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..c25073502b11d54ce95519f925ba24e4fe43d743 --- /dev/null +++ b/llm_insight/HumanEval/3/QS/CC_recommendation.json @@ -0,0 +1,26 @@ +{ + "High accuracy and robustness": [ + { + "Nxcode-CQ-7B": "Consistently high performance across all subsets, making it ideal for scenarios requiring reliable and stable outputs." + }, + { + "deepseek_coder-6.7b-instruct": "Good balance of performance and adaptability, suitable for diverse data conditions." + } + ], + "Moderate accuracy with cost-effectiveness": [ + { + "deepseek_coder_33b-instruct": "Offers decent performance at a potentially lower cost compared to larger models like Nxcode-CQ-7B." + }, + { + "codeqwen1.5-7b": "Provides moderate performance at a lower computational cost, suitable for budget-constrained scenarios." + } + ], + "Low-cost with acceptable performance": [ + { + "codegemma-7b-it": "While not the best performer, it offers a cost-effective solution for less critical applications." + }, + { + "deepseek-coder-6.7b-base": "A budget-friendly option with moderate performance, suitable for non-critical tasks." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/QS/CC_report.json b/llm_insight/HumanEval/3/QS/CC_report.json new file mode 100644 index 0000000000000000000000000000000000000000..83ddf4b718cca99276d2bdf107e096e68c080346 --- /dev/null +++ b/llm_insight/HumanEval/3/QS/CC_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows a consistent decline in performance across subsets, indicating potential sensitivity to dataset variations. It performs best in CC_subset_1 but drops significantly in CC_subset_3.", + "Nxcode-CQ-7B": "This model demonstrates robust performance across all subsets, with minimal variation. It maintains high accuracy, suggesting strong generalization capabilities.", + "codegemma-2b": "Performance degrades sharply across subsets, indicating poor adaptability to varying data conditions. The model struggles significantly in CC_subset_3.", + "codegemma-7b": "Similar to codegemma-2b, this model shows a steady decline in performance, though it starts from a higher baseline. It may not be suitable for diverse data conditions.", + "codegemma-7b-it": "While performance declines across subsets, the drop is less severe compared to other codegemma variants. 
It maintains moderate accuracy in CC_subset_3.", + "deepseek-coder-1.3b-base": "This model exhibits a steep performance drop, particularly in CC_subset_3, suggesting limited robustness to dataset changes.", + "deepseek-coder-6.7b-base": "Performance declines steadily but remains above smaller models like codegemma-2b. It shows moderate adaptability.", + "deepseek_coder-6.7b-instruct": "This model maintains relatively high performance across subsets, with a slight decline in CC_subset_3. It shows good generalization.", + "deepseek_coder_33b-base": "Performance drops steadily but remains above smaller models. It shows moderate robustness to dataset variations.", + "deepseek_coder_33b-instruct": "The model maintains decent performance across subsets, with a gradual decline. It shows better adaptability than its base counterpart.", + "codeqwen1.5-7b": "Performance declines across subsets but remains above smaller models. It shows moderate adaptability to varying data conditions.", + "global_insights": "Larger models generally perform better across subsets, with Nxcode-CQ-7B being the most consistent. Performance degradation is observed in all models as dataset complexity increases (from CC_subset_1 to CC_subset_3). Instruct-tuned models tend to outperform their base counterparts, suggesting the value of instruction fine-tuning." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/QS/line_counts_recommendation.json b/llm_insight/HumanEval/3/QS/line_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..9f328cac47fb30db4e1e7662bf756d191a75281e --- /dev/null +++ b/llm_insight/HumanEval/3/QS/line_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High accuracy and consistency": [ + { + "Nxcode-CQ-7B": "Consistently outperforms all other models across all subsets, making it the best choice for high accuracy tasks." + }, + { + "CodeFuse-DeepSeek-33b": "Strong performance and robustness, though slightly less consistent than Nxcode-CQ-7B." + } + ], + "Moderate accuracy with cost-effectiveness": [ + { + "deepseek_coder-6.7b-instruct": "Good performance at a potentially lower cost compared to larger models." + }, + { + "deepseek_coder_33b-instruct": "Better performance than its base version, suitable for tasks needing moderate accuracy." + } + ], + "Low-cost for simpler tasks": [ + { + "codegemma-7b-it": "Decent performance for simpler tasks, though not suitable for high accuracy needs." + }, + { + "codeqwen1.5-7b": "Moderate performance at a lower cost, suitable for less critical tasks." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/QS/line_counts_report.json b/llm_insight/HumanEval/3/QS/line_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..5289d8f97db28e4747895629f2a106152683fa37 --- /dev/null +++ b/llm_insight/HumanEval/3/QS/line_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance across all subsets, with the highest score in subset_1 (81.82) and a slight drop in subset_2 (72.22) and subset_3 (76.36). This indicates robustness but some variability based on line counts.", + "Nxcode-CQ-7B": "This model outperforms all others, especially in subset_1 (92.09) and maintains high scores in subset_2 (88.33) and subset_3 (81.45). Its consistency and high performance make it a top contender.", + "codegemma-2b": "The model performs poorly across all subsets, with the lowest scores (44.09, 17.5, 19.64). 
It is not suitable for tasks requiring high accuracy.", + "codegemma-7b": "Performance is mediocre, with scores (52.45, 35.19, 31.64) showing a significant drop as line counts increase. Not recommended for complex tasks.", + "codegemma-7b-it": "Better than its non-IT counterpart, but still shows a declining trend (66.36, 49.26, 43.73). Suitable for simpler tasks but not for high accuracy needs.", + "deepseek-coder-1.3b-base": "Low scores (47.45, 26.39, 23.0) indicate this model is not suitable for tasks requiring precision or handling larger line counts.", + "deepseek-coder-6.7b-base": "Moderate performance (63.36, 39.35, 34.18) but declines with larger subsets. Limited utility for complex tasks.", + "deepseek_coder-6.7b-instruct": "Strong performance (85.0, 66.85, 62.82) but not as consistent as Nxcode-CQ-7B. Good for tasks needing moderate accuracy.", + "deepseek_coder_33b-base": "Decent performance (68.0, 48.89, 41.27) but declines with larger subsets. Not the best choice for high accuracy.", + "deepseek_coder_33b-instruct": "Good performance (82.09, 62.31, 53.91) but still not as consistent as Nxcode-CQ-7B. Suitable for tasks needing moderate accuracy.", + "codeqwen1.5-7b": "Moderate scores (59.73, 48.7, 45.64) with a declining trend. Limited utility for high accuracy tasks.", + "global_insights": "Nxcode-CQ-7B is the top performer across all subsets, showing both high accuracy and consistency. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly, especially with larger line counts. Instruct versions of models generally perform better than their base counterparts. Line count significantly impacts performance for most models, with larger subsets showing lower scores." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/QS/token_counts_recommendation.json b/llm_insight/HumanEval/3/QS/token_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..02a97f162f34cf5186a0572c511b5b4a8bf0b68e --- /dev/null +++ b/llm_insight/HumanEval/3/QS/token_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High performance and robustness": [ + { + "Nxcode-CQ-7B": "Consistently high scores across all token subsets, making it reliable for varied token counts." + }, + { + "deepseek_coder-6.7b-instruct": "Strong performance in all subsets, suitable for tasks requiring stability across token counts." + } + ], + "Balanced performance and cost-effectiveness": [ + { + "deepseek_coder_33b-instruct": "Good performance with a reasonable drop in higher token counts, offering a balance between cost and capability." + }, + { + "codeqwen1.5-7b": "Moderate performance with a stable decline, suitable for tasks where cost is a concern but performance cannot be compromised too much." + } + ], + "Low-cost for low token count tasks": [ + { + "codegemma-7b-it": "Performs well in low to medium token counts, offering a cost-effective solution for smaller tasks." + }, + { + "deepseek-coder-6.7b-base": "Better than smaller models and cost-effective for tasks with moderate token counts." 
+ } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/3/QS/token_counts_report.json b/llm_insight/HumanEval/3/QS/token_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..eada21f4cd33b62220b73da974f5a2feba3ce741 --- /dev/null +++ b/llm_insight/HumanEval/3/QS/token_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_2 (88.89) but drops significantly in token_subset_3 (69.09), indicating potential sensitivity to token count variations.", + "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the highest score in token_subset_1 (90.73). This suggests robustness to token count changes.", + "codegemma-2b": "Performance declines sharply as token count increases, with the lowest score in token_subset_3 (11.82). Not suitable for high token count tasks.", + "codegemma-7b": "Similar to codegemma-2b but with better overall performance, though still struggles with higher token counts.", + "codegemma-7b-it": "Shows a steady decline with increasing token counts but maintains relatively better performance than other codegemma variants.", + "deepseek-coder-1.3b-base": "Performance drops significantly with higher token counts, similar to codegemma models.", + "deepseek-coder-6.7b-base": "Better than 1.3b-base but still shows a notable decline in token_subset_3 (25.09).", + "deepseek_coder-6.7b-instruct": "Strong performance across all subsets, with the highest score in token_subset_1 (83.64). Robust to token count variations.", + "deepseek_coder_33b-base": "Moderate performance with a steady decline in higher token counts.", + "deepseek_coder_33b-instruct": "Good performance overall but shows a noticeable drop in token_subset_3 (50.36).", + "codeqwen1.5-7b": "Performance declines with higher token counts but remains relatively stable compared to other models.", + "global_insights": "Models like Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show robustness across token count variations, while smaller models (e.g., codegemma-2b, deepseek-coder-1.3b-base) struggle with higher token counts. Larger models generally perform better but may still show declines in the highest token subset." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/EI/CC_recommendation.json b/llm_insight/HumanEval/4/EI/CC_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..0304d7d066a4ff5506bb39cc880dcb9e1af36ab0 --- /dev/null +++ b/llm_insight/HumanEval/4/EI/CC_recommendation.json @@ -0,0 +1,26 @@ +{ + "High performance with no cost constraints": [ + { + "Nxcode-CQ-7B": "Consistently top performer across most subsets, making it the best choice for high-stakes CC tasks." + }, + { + "CodeFuse-DeepSeek-33b": "Strong performance in multiple subsets, though slightly less consistent than Nxcode-CQ-7B." + } + ], + "Balanced performance and cost": [ + { + "deepseek_coder_33b-instruct": "Reasonable performance across subsets, offering a good balance between cost and capability." + }, + { + "codeqwen1.5-7b": "Moderate performance in some subsets, potentially more cost-effective than larger models." + } + ], + "Low cost with acceptable performance": [ + { + "codegemma-7b-it": "Better than other small models, though still inconsistent. Suitable for less critical tasks." + }, + { + "deepseek_coder-6.7b-instruct": "Better among smaller deepseek models, but still has significant performance drops." 
+ } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/EI/CC_report.json b/llm_insight/HumanEval/4/EI/CC_report.json new file mode 100644 index 0000000000000000000000000000000000000000..76416725d5846134eabca664de731c4967800733 --- /dev/null +++ b/llm_insight/HumanEval/4/EI/CC_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in CC_subset_1 and CC_subset_4 but drops significantly in CC_subset_2 and CC_subset_3. This suggests it may struggle with certain types of CC-related tasks.", + "Nxcode-CQ-7B": "This model performs exceptionally well in CC_subset_1, CC_subset_2, and CC_subset_4, but has a notable drop in CC_subset_3. It's consistently one of the top performers across most subsets.", + "codegemma-2b": "The model performs poorly across all subsets, particularly in CC_subset_3 and CC_subset_4 where it scores 0.0 and 2.5 respectively. It's not suitable for CC-related tasks.", + "codegemma-7b": "While better than its 2b counterpart, this model still struggles, especially in CC_subset_3 where it scores 0.0. It shows some improvement in other subsets but remains weak overall.", + "codegemma-7b-it": "This model shows moderate performance in CC_subset_1 and CC_subset_2 but drops in CC_subset_3 and CC_subset_4. It's better than other codegemma versions but still inconsistent.", + "deepseek-coder-1.3b-base": "The model performs poorly across all subsets, particularly in CC_subset_3 and CC_subset_4 where it scores 0.0. It's not recommended for CC tasks.", + "deepseek-coder-6.7b-base": "This model shows some capability in CC_subset_1 and CC_subset_2 but fails completely in CC_subset_3 and CC_subset_4. It's inconsistent and not reliable.", + "deepseek_coder-6.7b-instruct": "The model performs well in CC_subset_1 and CC_subset_2 but drops significantly in CC_subset_3 and CC_subset_4. It's a better option among deepseek models but still inconsistent.", + "deepseek_coder_33b-base": "This model shows moderate performance in CC_subset_1 and CC_subset_2 but struggles in CC_subset_3 and CC_subset_4. It's better than smaller deepseek models but not top-tier.", + "deepseek_coder_33b-instruct": "The model performs reasonably well across all subsets, though it drops in CC_subset_3 and CC_subset_4. It's one of the better deepseek models but still not the best.", + "codeqwen1.5-7b": "This model shows moderate performance in CC_subset_1 and CC_subset_2, fails in CC_subset_3, but recovers somewhat in CC_subset_4. It's inconsistent but has some strengths.", + "global_insights": "Nxcode-CQ-7B is the top performer overall, excelling in most subsets. CodeFuse-DeepSeek-33b also shows strong performance in some subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly across the board. The CC_subset_3 appears to be the most challenging for all models, with many scoring 0.0. Models generally perform better in CC_subset_1 and CC_subset_2, suggesting these subsets may contain less complex tasks." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/EI/line_counts_recommendation.json b/llm_insight/HumanEval/4/EI/line_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..e376f9fd74c9ad2fc1305613b554ac2b7928173e --- /dev/null +++ b/llm_insight/HumanEval/4/EI/line_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High accuracy for small to medium tasks": [ + { + "Nxcode-CQ-7B": "Best performance in subsets 1 and 2, suitable for tasks with smaller line counts." 
+ }, + { + "CodeFuse-DeepSeek-33b": "Consistent performance across all subsets, reliable for varied tasks." + } + ], + "Moderate accuracy with cost-effectiveness": [ + { + "deepseek_coder-6.7b-instruct": "Good balance of performance and cost for smaller tasks." + }, + { + "codegemma-7b-it": "Cost-effective for moderate line counts with decent performance." + } + ], + "Not recommended": [ + { + "codegemma-2b": "Poor performance across all subsets." + }, + { + "deepseek-coder-1.3b-base": "Inconsistent and low performance." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/EI/line_counts_report.json b/llm_insight/HumanEval/4/EI/line_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..28d46eb845ee1fd061f382c6c19b05a0d9ee71af --- /dev/null +++ b/llm_insight/HumanEval/4/EI/line_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "Consistent performance across subsets, with a slight dip in line_subset_2. Demonstrates robustness in handling varying line counts.", + "Nxcode-CQ-7B": "Highest performance in line_subset_1 and line_subset_2, but significant drop in line_subset_3. May struggle with larger line counts.", + "codegemma-2b": "Poor performance across all subsets, especially in line_subset_3 and line_subset_4. Not suitable for tasks requiring high accuracy.", + "codegemma-7b": "Better than codegemma-2b but still underperforms in larger line counts. Limited scalability.", + "codegemma-7b-it": "Improved over codegemma-7b, especially in line_subset_4. Shows potential for tasks with moderate line counts.", + "deepseek-coder-1.3b-base": "Low performance overall, with a surprising spike in line_subset_4. Inconsistent behavior.", + "deepseek-coder-6.7b-base": "Moderate performance, but drops significantly in line_subset_3 and line_subset_4. Limited reliability.", + "deepseek_coder-6.7b-instruct": "Strong performance in line_subset_1 and line_subset_2, but drops in larger subsets. Best for smaller tasks.", + "deepseek_coder_33b-base": "Decent performance but inconsistent across subsets. May require fine-tuning.", + "deepseek_coder_33b-instruct": "Good performance in line_subset_1 and line_subset_2, but drops sharply in line_subset_4. Unpredictable for larger tasks.", + "codeqwen1.5-7b": "Moderate performance across subsets, with a dip in line_subset_3. Balanced but not outstanding.", + "global_insights": "Models generally perform better in smaller line counts (subsets 1 and 2). Larger subsets (3 and 4) show significant performance drops, indicating scalability issues. Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are top performers but have trade-offs. Smaller models like codegemma-2b and deepseek-coder-1.3b-base are not recommended for any serious tasks." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/EI/token_counts_recommendation.json b/llm_insight/HumanEval/4/EI/token_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..28a77ed509d2590de06b6478d12a84fd644562e1 --- /dev/null +++ b/llm_insight/HumanEval/4/EI/token_counts_recommendation.json @@ -0,0 +1,23 @@ +{ + "High performance with long inputs": [ + { + "CodeFuse-DeepSeek-33b": "This model is the only one that handles the longest token subset perfectly, making it ideal for tasks requiring processing of very long inputs." 
+ } + ], + "Balanced performance across most inputs": [ + { + "Nxcode-CQ-7B": "This model performs consistently well across most token subsets, making it a reliable choice for general tasks with varying input lengths." + }, + { + "deepseek_coder-6.7b-instruct": "This model shows strong performance in shorter to medium token subsets, making it suitable for tasks where inputs are not extremely long." + } + ], + "Cost-effective for shorter inputs": [ + { + "codegemma-7b-it": "This model offers reasonable performance for shorter inputs at a lower computational cost compared to larger models." + }, + { + "codeqwen1.5-7b": "This model provides a balance between performance and cost for tasks with shorter to medium input lengths." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/EI/token_counts_report.json b/llm_insight/HumanEval/4/EI/token_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..1f91bfc97099e053235fd4da2b5d24c5856227af --- /dev/null +++ b/llm_insight/HumanEval/4/EI/token_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_4 with a perfect score of 100.0, but its performance degrades as token counts decrease, indicating potential sensitivity to input length.", + "Nxcode-CQ-7B": "This model performs consistently well across token_subset_1 to token_subset_3 but shows a significant drop in token_subset_4, suggesting it may struggle with very long inputs.", + "codegemma-2b": "The model's performance is poor across all subsets, with particularly low scores in token_subset_3 and token_subset_4, indicating it may not be suitable for tasks requiring handling of varying token lengths.", + "codegemma-7b": "While better than its 2b counterpart, this model still struggles with longer token subsets, showing a clear decline in performance as token counts increase.", + "codegemma-7b-it": "This model shows a moderate improvement over the base 7b model, particularly in token_subset_3, but still fails to handle token_subset_4.", + "deepseek-coder-1.3b-base": "The model's performance is weak across all subsets, with scores dropping sharply as token counts increase, making it unsuitable for tasks with longer inputs.", + "deepseek-coder-6.7b-base": "This model shows better performance than the 1.3b version but still struggles with longer token subsets.", + "deepseek_coder-6.7b-instruct": "The model performs well in token_subset_1 and token_subset_2 but shows a significant drop in token_subset_3 and fails in token_subset_4.", + "deepseek_coder_33b-base": "This model shows decent performance in token_subset_1 but declines as token counts increase, indicating limitations with longer inputs.", + "deepseek_coder_33b-instruct": "The model performs reasonably well in token_subset_1 and token_subset_2 but struggles with longer token subsets.", + "codeqwen1.5-7b": "This model shows moderate performance across subsets but fails to handle token_subset_4, similar to other models.", + "global_insights": "Most models struggle with longer token subsets, with only CodeFuse-DeepSeek-33b showing perfect performance in token_subset_4. Nxcode-CQ-7B performs consistently well across most subsets but fails in the longest subset. Smaller models generally perform worse, especially with longer inputs. The instruct versions of models tend to perform better than their base counterparts." 
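Every *_recommendation.json added in this change shares one shape: a scenario label mapping to a list of single-key {model: reason} objects. A minimal sketch of how such a file could be flattened for tabular display follows; the helper name flatten_recommendations and the use of pandas are assumptions for illustration, not code from this change.

    import json
    import pandas as pd

    def flatten_recommendations(path):
        # Recommendation files map a scenario label to a list of
        # single-key {model: reason} dicts; emit one row per (scenario, model).
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        rows = []
        for scenario, entries in data.items():
            for entry in entries:
                for model, reason in entry.items():
                    rows.append({"Scenario": scenario, "Model": model, "Reason": reason})
        return pd.DataFrame(rows)

    # Example, using a path introduced in this diff:
    # flatten_recommendations("llm_insight/HumanEval/4/EI/CC_recommendation.json")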
+} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/QS/CC_recommendation.json b/llm_insight/HumanEval/4/QS/CC_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..0bb82833043665f99c07033e319314398b612dde --- /dev/null +++ b/llm_insight/HumanEval/4/QS/CC_recommendation.json @@ -0,0 +1,26 @@ +{ + "High performance with robust generalization": [ + { + "Nxcode-CQ-7B": "Consistently high scores across all subsets, indicating strong generalization and reliability." + }, + { + "CodeFuse-DeepSeek-33b": "Despite the decline, it maintains competitive performance, suitable for tasks where initial high accuracy is critical." + } + ], + "Cost-effective with moderate performance": [ + { + "deepseek_coder-6.7b-instruct": "Balances performance and cost, especially beneficial with instruction tuning." + }, + { + "deepseek_coder_33b-instruct": "Larger but still cost-effective for tasks needing stable performance." + } + ], + "Budget-constrained scenarios": [ + { + "codegemma-7b-it": "Reasonable performance for its size, suitable when resources are limited." + }, + { + "codeqwen1.5-7b": "Moderate performance at a lower computational cost." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/QS/CC_report.json b/llm_insight/HumanEval/4/QS/CC_report.json new file mode 100644 index 0000000000000000000000000000000000000000..fa61a55f94a421d57a804a52d94ce2c98316ec76 --- /dev/null +++ b/llm_insight/HumanEval/4/QS/CC_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows a consistent decline in performance across subsets, indicating potential overfitting or lack of generalization. The drop from 85.37 to 68.29 suggests sensitivity to data distribution shifts.", + "Nxcode-CQ-7B": "This model demonstrates strong performance across all subsets, with a slight dip in CC_subset_2. Its high scores in CC_subset_3 and CC_subset_4 indicate robustness to varying data conditions.", + "codegemma-2b": "Poor performance across all subsets, with particularly low scores in CC_subset_4. This suggests the model may lack the capacity for the task or requires significant fine-tuning.", + "codegemma-7b": "Better than its 2b counterpart but still underperforms compared to larger models. The performance drop in CC_subset_4 is significant, indicating limitations in handling more complex data.", + "codegemma-7b-it": "Improved over the base 7b model, especially in CC_subset_2 and CC_subset_4, suggesting that instruction tuning helps but may not be sufficient for top-tier performance.", + "deepseek-coder-1.3b-base": "Low scores across all subsets, similar to codegemma-2b, indicating that small models struggle with this task regardless of architecture.", + "deepseek-coder-6.7b-base": "Moderate performance with a notable drop in CC_subset_4. The model shows potential but may need further optimization or scaling.", + "deepseek_coder-6.7b-instruct": "Strong performance, especially in CC_subset_1 and CC_subset_3, indicating that instruction tuning significantly benefits this model size.", + "deepseek_coder_33b-base": "Decent performance but inconsistent, with a significant drop in CC_subset_4. The base model may need fine-tuning to stabilize performance.", + "deepseek_coder_33b-instruct": "Consistently good performance across subsets, though not the best. Instruction tuning helps, but the model may still lag behind the top performers.", + "codeqwen1.5-7b": "Moderate performance with a steady decline across subsets. 
The model shows promise but may need further scaling or tuning.", + "global_insights": "Larger models generally perform better, with Nxcode-CQ-7B and CodeFuse-DeepSeek-33b leading. Instruction tuning (e.g., deepseek_coder-6.7b-instruct) significantly boosts performance. Smaller models (codegemma-2b, deepseek-coder-1.3b-base) struggle, indicating a minimum model size is required for this task. Performance drops in later subsets suggest increasing complexity or distribution shifts that challenge generalization." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/QS/line_counts_recommendation.json b/llm_insight/HumanEval/4/QS/line_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..af2e574c695817eb1b0c1b9c63eee28c971f0055 --- /dev/null +++ b/llm_insight/HumanEval/4/QS/line_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High performance with moderate cost": [ + { + "Nxcode-CQ-7B": "Maintains high performance across all subsets, making it suitable for tasks requiring consistent accuracy." + }, + { + "deepseek_coder_33b-instruct": "Offers robust performance across subsets, ideal for tasks needing reliable results with varying line counts." + } + ], + "Balanced performance and cost": [ + { + "CodeFuse-DeepSeek-33b": "Good performance in smaller subsets, suitable for tasks where line counts are generally low." + }, + { + "deepseek_coder-6.7b-instruct": "Performs well in smaller subsets and moderately in larger ones, offering a balance between cost and performance." + } + ], + "Low cost with acceptable performance for small tasks": [ + { + "codegemma-7b-it": "Performs adequately in smaller subsets, suitable for budget-conscious projects with limited line counts." + }, + { + "codeqwen1.5-7b": "Offers moderate performance at a lower cost, ideal for smaller-scale tasks." 
+ } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/QS/line_counts_report.json b/llm_insight/HumanEval/4/QS/line_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..2da5f95d7d99ddae73720f53dbd0c20cc4b4de4e --- /dev/null +++ b/llm_insight/HumanEval/4/QS/line_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows consistent performance across subsets 1 and 2 but drops significantly in subsets 3 and 4, indicating potential sensitivity to line count variations.", + "Nxcode-CQ-7B": "This model maintains relatively high performance across all subsets, though there is a gradual decline as line counts increase, suggesting robustness but with some sensitivity to complexity.", + "codegemma-2b": "Performance drops sharply from subset 1 to subset 2 and remains low, indicating poor scalability with increasing line counts.", + "codegemma-7b": "Similar to codegemma-2b, this model shows a significant drop in performance after subset 1, though it performs slightly better overall.", + "codegemma-7b-it": "This model performs better than its non-IT counterpart but still shows a decline as line counts increase, suggesting limited scalability.", + "deepseek-coder-1.3b-base": "Performance declines sharply after subset 1, indicating poor handling of larger code segments.", + "deepseek-coder-6.7b-base": "Shows a moderate drop in performance across subsets, with a significant decline in subset 3, suggesting limitations with more complex code.", + "deepseek_coder-6.7b-instruct": "Maintains high performance in subsets 1 and 2 but drops in subsets 3 and 4, indicating sensitivity to line count increases.", + "deepseek_coder_33b-base": "Performance declines steadily across subsets, suggesting a linear relationship between line count and performance drop.", + "deepseek_coder_33b-instruct": "Shows a gradual decline in performance across subsets, but maintains relatively higher scores compared to other models.", + "codeqwen1.5-7b": "Performance drops gradually across subsets, indicating moderate sensitivity to line count increases.", + "global_insights": "Models generally perform better on smaller line counts (subsets 1 and 2) and decline as line counts increase (subsets 3 and 4). Nxcode-CQ-7B and deepseek_coder_33b-instruct show the most robustness across all subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle significantly with larger line counts." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/QS/token_counts_recommendation.json b/llm_insight/HumanEval/4/QS/token_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..0f4adbf3af447c2e3c5812ac35d8f1f1d4362761 --- /dev/null +++ b/llm_insight/HumanEval/4/QS/token_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High performance and robustness across all token counts": [ + { + "Nxcode-CQ-7B": "Consistently high scores across all subsets, making it reliable for varied token counts." + }, + { + "deepseek_coder-6.7b-instruct": "Strong performance in most subsets, though slightly drops in the highest token count." + } + ], + "Moderate performance with cost-effectiveness": [ + { + "codegemma-7b-it": "Better performance than codegemma-7b and codegemma-2b, but still affordable." + }, + { + "codeqwen1.5-7b": "Moderate performance with a balance of cost and capability." 
+ } + ], + "Not recommended for high token count tasks": [ + { + "codegemma-2b": "Poor performance across all subsets, especially in higher token counts." + }, + { + "deepseek-coder-1.3b-base": "Sharp decline in performance with higher token counts." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/4/QS/token_counts_report.json b/llm_insight/HumanEval/4/QS/token_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..29d1966eb225077bfda3165187efc6e8085ba765 --- /dev/null +++ b/llm_insight/HumanEval/4/QS/token_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_2 (85.37) but drops significantly in token_subset_4 (63.41). This suggests it handles moderate token counts well but struggles with higher token counts.", + "Nxcode-CQ-7B": "Consistently high performance across all subsets, especially in token_subset_1 (93.9). This indicates robustness across varying token counts.", + "codegemma-2b": "Poor performance across all subsets, with a steep decline in token_subset_4 (8.05). Not suitable for tasks with high token counts.", + "codegemma-7b": "Moderate performance, but drops significantly in token_subset_3 (27.56) and token_subset_4 (26.34). Limited capability with higher token counts.", + "codegemma-7b-it": "Better than codegemma-7b, but still shows a decline in token_subset_3 (37.07) and token_subset_4 (41.83).", + "deepseek-coder-1.3b-base": "Low performance across all subsets, with a sharp drop in token_subset_4 (12.32). Not recommended for high token count tasks.", + "deepseek-coder-6.7b-base": "Moderate performance, but declines in token_subset_3 (36.34) and token_subset_4 (23.9).", + "deepseek_coder-6.7b-instruct": "Strong performance in token_subset_1 (85.98) and token_subset_3 (72.8), but drops in token_subset_4 (56.34).", + "deepseek_coder_33b-base": "Moderate performance, with a decline in token_subset_4 (32.8).", + "deepseek_coder_33b-instruct": "Strong performance across all subsets, though it drops in token_subset_4 (46.1).", + "codeqwen1.5-7b": "Moderate performance, with a decline in token_subset_3 (39.88) and token_subset_4 (40.37).", + "global_insights": "Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show the most consistent performance across all token subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle with higher token counts. Larger models generally perform better but still show declines in the highest token subset." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/EI/CC_recommendation.json b/llm_insight/HumanEval/5/EI/CC_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..3701cdcb9f0d596243f075a83a3c30d4030f9ed3 --- /dev/null +++ b/llm_insight/HumanEval/5/EI/CC_recommendation.json @@ -0,0 +1,26 @@ +{ + "High accuracy and robustness": [ + { + "Nxcode-CQ-7B": "Consistently high performance across most subsets, making it reliable for diverse tasks." + }, + { + "CodeFuse-DeepSeek-33b": "Strong performance in specific subsets, suitable for tasks similar to CC_subset_1 and CC_subset_4." + } + ], + "Moderate accuracy with cost-effectiveness": [ + { + "deepseek_coder-6.7b-instruct": "Decent performance at a potentially lower computational cost compared to larger models." + }, + { + "codeqwen1.5-7b": "Balanced performance and may offer a good trade-off between cost and accuracy." + } + ], + "Not recommended": [ + { + "codegemma-2b": "Poor performance across all subsets." 
+ }, + { + "deepseek-coder-1.3b-base": "Very low scores, not suitable for any serious tasks." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/EI/CC_report.json b/llm_insight/HumanEval/5/EI/CC_report.json new file mode 100644 index 0000000000000000000000000000000000000000..2881a8b958ccd7c0ddeccd3c33b5837ae1daf92d --- /dev/null +++ b/llm_insight/HumanEval/5/EI/CC_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in CC_subset_4 (100%) and CC_subset_1 (81.63%), but struggles in CC_subset_3 (33.33%). This indicates variability in handling different subsets, possibly due to the nature of the tasks in each subset.", + "Nxcode-CQ-7B": "Consistently high performance across most subsets (CC_subset_1: 87.35%, CC_subset_2: 88.73%, CC_subset_3: 84.17%, CC_subset_5: 98.33%), except for CC_subset_4 (37.5%). This suggests robustness but a potential weakness in tasks similar to CC_subset_4.", + "codegemma-2b": "Poor performance across all subsets, with the highest score being 31.63% in CC_subset_1. This model is not suitable for tasks requiring high accuracy.", + "codegemma-7b": "Moderate performance, with the best score in CC_subset_1 (46.84%). Performance drops significantly in other subsets, indicating limited generalization.", + "codegemma-7b-it": "Better than codegemma-7b, with the highest score in CC_subset_1 (57.35%). However, performance is inconsistent, especially in CC_subset_4 (2.5%).", + "deepseek-coder-1.3b-base": "Very poor performance, with scores below 40% in all subsets. Not recommended for any serious tasks.", + "deepseek-coder-6.7b-base": "Slightly better than the 1.3b version, but still underperforms with a maximum score of 51.68% in CC_subset_1.", + "deepseek_coder-6.7b-instruct": "Decent performance in CC_subset_1 (73.16%) and CC_subset_2 (76.36%), but struggles in CC_subset_4 (0%).", + "deepseek_coder_33b-base": "Inconsistent performance, with highs in CC_subset_4 (40%) and lows in CC_subset_5 (3.33%).", + "deepseek_coder_33b-instruct": "Good performance in CC_subset_1 (67.86%), CC_subset_2 (67.09%), and CC_subset_3 (56.67%), but fails in CC_subset_4 (5%).", + "codeqwen1.5-7b": "Moderate performance across subsets, with the best score in CC_subset_4 (47.5%). Shows some consistency but lacks top-tier performance.", + "global_insights": "Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are the top performers, with Nxcode-CQ-7B being more consistent. Smaller models like codegemma-2b and deepseek-coder-1.3b-base are not suitable for high-accuracy tasks. The instruct versions of models generally perform better than their base counterparts. CC_subset_4 appears to be the most challenging subset for most models." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/EI/line_counts_recommendation.json b/llm_insight/HumanEval/5/EI/line_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..f41c1ab8fd2306ff68537f64d3658f1073f0b428 --- /dev/null +++ b/llm_insight/HumanEval/5/EI/line_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High Performance, High Cost": [ + { + "CodeFuse-DeepSeek-33b": "Consistently high performance across most subsets, though with some variability." + }, + { + "Nxcode-CQ-7B": "Strong performance in most scenarios, except for very long lines of code." + } + ], + "Balanced Performance and Cost": [ + { + "deepseek_coder-6.7b-instruct": "Decent performance with instruction tuning, suitable for tasks not involving very long lines of code." 
+ }, + { + "codeqwen1.5-7b": "Moderate performance across most subsets, a good balance for general use." + } + ], + "Low Cost, Limited Performance": [ + { + "codegemma-7b-it": "Better than its base version but still limited, suitable for low-budget scenarios." + }, + { + "deepseek_coder_33b-instruct": "Instruction-tuned but inconsistent, only recommended for low-priority tasks." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/EI/line_counts_report.json b/llm_insight/HumanEval/5/EI/line_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..a586945624657e259ec6f0d2b1c24960dd24663d --- /dev/null +++ b/llm_insight/HumanEval/5/EI/line_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance across most subsets, particularly excelling in line_subset_4 with a perfect score. However, there is variability in performance, indicating potential sensitivity to line count differences.", + "Nxcode-CQ-7B": "This model demonstrates consistent performance in line_subset_1, line_subset_2, and line_subset_3 but shows a significant drop in line_subset_4. This suggests potential limitations with very short or very long lines of code.", + "codegemma-2b": "The model performs poorly across all subsets, with particularly low scores in line_subset_4 and line_subset_5. This indicates a general lack of capability in handling varying line counts.", + "codegemma-7b": "While better than codegemma-2b, this model still struggles, especially with longer lines of code. Performance is inconsistent and generally low.", + "codegemma-7b-it": "This model shows improvement over its non-instruction-tuned counterpart but still has significant room for improvement, particularly in handling longer lines of code.", + "deepseek-coder-1.3b-base": "The model performs poorly, with very low scores in line_subset_4 and inconsistent performance elsewhere. Not suitable for tasks requiring robustness across line counts.", + "deepseek-coder-6.7b-base": "Performance is inconsistent, with a complete failure in line_subset_4. This model is not reliable for tasks involving varying line counts.", + "deepseek_coder-6.7b-instruct": "The model shows decent performance in line_subset_1 and line_subset_2 but struggles with longer lines. The instruction tuning helps but is not sufficient for all scenarios.", + "deepseek_coder_33b-base": "Performance is middling, with no standout strengths. The model is inconsistent across subsets, indicating limited robustness.", + "deepseek_coder_33b-instruct": "This model performs well in line_subset_1 and line_subset_3 but shows significant drops in line_subset_4 and line_subset_5. Instruction tuning helps but does not fully address the limitations.", + "codeqwen1.5-7b": "The model shows moderate performance across most subsets but struggles with longer lines. It is a middle-of-the-road option with no extreme weaknesses or strengths.", + "global_insights": "Models generally perform better with shorter lines of code (line_subset_1 and line_subset_2) and struggle with longer lines (line_subset_4 and line_subset_5). Instruction-tuned models tend to perform better than their base counterparts, but the improvement is not always significant. The best-performing models are CodeFuse-DeepSeek-33b and Nxcode-CQ-7B, but even they have notable weaknesses." 
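The QS and EI path segments in these files encode the two division methods: Equal Frequency Partitioning (QS) aims for roughly the same number of problems per subset, while Equal Interval Partitioning (EI) cuts the metric's value range into equal-width bins. A rough sketch of the assignment logic under those definitions, assuming pandas; the repository's actual dividing scripts are not part of this diff.

    import pandas as pd

    def assign_subsets(values, num_parts, method):
        # QS (Equal Frequency): quantile bins give subsets of roughly equal size.
        # EI (Equal Interval): equal-width bins over the metric's value range.
        s = pd.Series(values)
        if method == "QS":
            return pd.qcut(s, q=num_parts, labels=False, duplicates="drop")
        return pd.cut(s, bins=num_parts, labels=False)

    # Example: bucket per-problem line counts into 5 equal-interval subsets.
    # subset_ids = assign_subsets([3, 7, 12, 25, 40, 58], 5, "EI")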
+} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/EI/token_counts_recommendation.json b/llm_insight/HumanEval/5/EI/token_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..c14feaddae4376ecfe3b6d407fb1cd632710e928 --- /dev/null +++ b/llm_insight/HumanEval/5/EI/token_counts_recommendation.json @@ -0,0 +1,29 @@ +{ + "High performance with balanced token handling": [ + { + "CodeFuse-DeepSeek-33b": "Consistently high performance across most token subsets, making it reliable for diverse token ranges." + }, + { + "Nxcode-CQ-7B": "Strong performance in lower to mid token subsets, ideal for tasks not requiring very high token counts." + } + ], + "Cost-effective for lower token tasks": [ + { + "deepseek_coder-6.7b-instruct": "Good performance in lower to mid token subsets at a potentially lower cost." + }, + { + "deepseek_coder_33b-instruct": "Balanced performance in lower subsets, suitable for tasks with moderate token counts." + } + ], + "Avoid for high token tasks": [ + { + "codegemma-2b": "Poor performance across all subsets, especially in higher token counts." + }, + { + "codegemma-7b": "Fails in higher token ranges, not recommended for robust tasks." + }, + { + "codegemma-7b-it": "Better than other codegemma variants but still inadequate for high token tasks." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/EI/token_counts_report.json b/llm_insight/HumanEval/5/EI/token_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..a0de9da234ee816610181550aea1d6d88e94f25c --- /dev/null +++ b/llm_insight/HumanEval/5/EI/token_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance across most subsets, particularly excelling in token_subset_5 with a perfect score. However, there is a noticeable dip in token_subset_3, suggesting potential variability in handling certain token ranges.", + "Nxcode-CQ-7B": "This model performs consistently well in subsets 1-4 but has a significant drop in token_subset_5, indicating a potential limitation with very high token counts.", + "codegemma-2b": "Poor performance across all subsets, especially in higher token counts, making it unsuitable for tasks requiring robust token handling.", + "codegemma-7b": "Moderate performance in lower token subsets but fails completely in higher token ranges, similar to codegemma-2b.", + "codegemma-7b-it": "Better than other codegemma variants but still struggles with higher token counts, showing a steep decline in performance.", + "deepseek-coder-1.3b-base": "Low performance across the board, with particularly poor results in higher token subsets.", + "deepseek-coder-6.7b-base": "Moderate performance in lower token subsets but fails to maintain consistency in higher ranges.", + "deepseek_coder-6.7b-instruct": "Strong performance in lower to mid token subsets but drops significantly in token_subset_5, similar to Nxcode-CQ-7B.", + "deepseek_coder_33b-base": "Decent performance in lower token subsets but struggles as token counts increase.", + "deepseek_coder_33b-instruct": "Good performance in lower subsets but inconsistent in higher token ranges, with a notable drop in token_subset_3.", + "codeqwen1.5-7b": "Moderate performance across subsets but fails in higher token counts, similar to other models.", + "global_insights": "Models generally perform better in lower token subsets, with performance degrading as token counts increase. 
CodeFuse-DeepSeek-33b and Nxcode-CQ-7B are exceptions, showing strong performance in most subsets except the highest token range. This suggests that token count is a critical factor in model performance, and models vary significantly in their ability to handle different token ranges." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/QS/CC_recommendation.json b/llm_insight/HumanEval/5/QS/CC_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..ab1e63e35e865703f0774e4c99954f514f2e33f7 --- /dev/null +++ b/llm_insight/HumanEval/5/QS/CC_recommendation.json @@ -0,0 +1,29 @@ +{ + "High accuracy and robustness": [ + { + "Nxcode-CQ-7B": "Consistently high performance across all subsets, making it the most reliable choice for high-accuracy tasks." + }, + { + "deepseek_coder-6.7b-instruct": "Strong performance across all subsets, particularly in subsets 1 and 4, indicating robustness." + }, + { + "deepseek_coder_33b-instruct": "Reliable performance across all subsets, with high scores in subsets 1 and 4, making it a strong contender for high-accuracy tasks." + } + ], + "Moderate accuracy with cost-effectiveness": [ + { + "codegemma-7b-it": "Better performance than codegemma-7b and more cost-effective than larger models, suitable for tasks where moderate accuracy is acceptable." + }, + { + "codeqwen1.5-7b": "Moderate performance in subsets 1-4, making it a cost-effective choice for tasks where high accuracy is not critical." + } + ], + "Low accuracy for lightweight tasks": [ + { + "codegemma-2b": "Suitable for lightweight tasks where accuracy is not a priority." + }, + { + "deepseek-coder-1.3b-base": "Low performance but may be suitable for very basic tasks where cost is a major concern." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/QS/CC_report.json b/llm_insight/HumanEval/5/QS/CC_report.json new file mode 100644 index 0000000000000000000000000000000000000000..54455dbcb5a12bacd829eb7fdf4e0133e8fce50a --- /dev/null +++ b/llm_insight/HumanEval/5/QS/CC_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 1-4 but drops significantly in subset 5, indicating potential issues with specific types of data in subset 5.", + "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the highest scores in subsets 1 and 3. This model is robust and reliable across different data types.", + "codegemma-2b": "Poor performance across all subsets, particularly in subset 5. This model is not suitable for tasks requiring high accuracy.", + "codegemma-7b": "Moderate performance with a noticeable drop in subset 5. The model struggles with certain data types but performs reasonably well in others.", + "codegemma-7b-it": "Better performance than codegemma-7b, especially in subsets 2-5, but still not as strong as the top-performing models.", + "deepseek-coder-1.3b-base": "Low performance across all subsets, similar to codegemma-2b. Not recommended for high-accuracy tasks.", + "deepseek-coder-6.7b-base": "Moderate performance with a significant drop in subset 5. The model is inconsistent across different data types.", + "deepseek_coder-6.7b-instruct": "Strong performance across all subsets, particularly in subsets 1 and 4. This model is reliable and performs well under various conditions.", + "deepseek_coder_33b-base": "Good performance in subsets 1-4 but drops in subset 5. 
The model is generally reliable but has some weaknesses.", + "deepseek_coder_33b-instruct": "Strong performance across all subsets, with particularly high scores in subsets 1 and 4. This model is robust and performs well under various conditions.", + "codeqwen1.5-7b": "Moderate performance with a noticeable drop in subset 5. The model is inconsistent but performs reasonably well in subsets 1-4.", + "global_insights": "The models Nxcode-CQ-7B, deepseek_coder-6.7b-instruct, and deepseek_coder_33b-instruct consistently perform well across all subsets, indicating robustness. Subset 5 appears to be challenging for most models, suggesting it contains more complex or diverse data. The codegemma and deepseek-coder-1.3b-base models generally underperform, making them less suitable for high-accuracy tasks." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/QS/line_counts_recommendation.json b/llm_insight/HumanEval/5/QS/line_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..0488265abd7666e6330b8b2cad2a4ffaad6d19ff --- /dev/null +++ b/llm_insight/HumanEval/5/QS/line_counts_recommendation.json @@ -0,0 +1,23 @@ +{ + "High performance and robustness across varying line counts": [ + { + "Nxcode-CQ-7B": "This model consistently performs well across all subsets, making it ideal for tasks requiring stability and high accuracy regardless of line count." + } + ], + "Balanced performance and cost-effectiveness": [ + { + "deepseek_coder-6.7b-instruct": "This model offers strong performance in lower line counts and is more cost-effective than larger models, suitable for tasks where line counts are moderate." + }, + { + "CodeFuse-DeepSeek-33b": "Provides decent performance across subsets and is a good balance between cost and performance for tasks with varying line counts." + } + ], + "Budget-conscious scenarios with lower line counts": [ + { + "codeqwen1.5-7b": "A cost-effective option for tasks with lower line counts, though performance drops as line counts increase." + }, + { + "codegemma-7b-it": "Suitable for budget-conscious scenarios where line counts are low to moderate, though performance is not as robust as larger models." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/QS/line_counts_report.json b/llm_insight/HumanEval/5/QS/line_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..efac0e44cc845d3e4ce80ae0629dc94ff67c611c --- /dev/null +++ b/llm_insight/HumanEval/5/QS/line_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows consistent performance across subsets, with a peak in line_subset_2 (84.85). However, it has a noticeable dip in line_subset_4 (66.67), indicating potential sensitivity to certain line count distributions.", + "Nxcode-CQ-7B": "This model demonstrates strong and stable performance across all subsets, with the highest score in line_subset_1 (92.27) and maintaining above 87 in most subsets. It is robust to varying line counts.", + "codegemma-2b": "Performance is consistently low across all subsets, with the highest score in line_subset_1 (51.67) and dropping significantly in other subsets. This suggests the model struggles with tasks involving varying line counts.", + "codegemma-7b": "While better than codegemma-2b, this model still underperforms, with scores ranging from 28.44 to 60.45. 
It shows a declining trend as line counts increase.", + "codegemma-7b-it": "This model performs better than its non-IT counterpart, with scores ranging from 39.69 to 73.33. However, it still shows a decline as line counts increase.", + "deepseek-coder-1.3b-base": "The model's performance is poor, with scores ranging from 21.72 to 52.73. It shows a clear downward trend as line counts increase.", + "deepseek-coder-6.7b-base": "Performance is moderate, with scores ranging from 29.53 to 74.09. Like other models, it shows a decline with increasing line counts.", + "deepseek_coder-6.7b-instruct": "This model shows strong performance in line_subset_1 (79.55) and line_subset_2 (84.39), but drops significantly in line_subset_5 (52.03). It is sensitive to higher line counts.", + "deepseek_coder_33b-base": "Performance is moderate, with scores ranging from 38.59 to 75.0. It shows a consistent decline as line counts increase.", + "deepseek_coder_33b-instruct": "This model performs well in line_subset_1 (82.27) but shows a steady decline to 50.16 in line_subset_5. It is less robust to higher line counts.", + "codeqwen1.5-7b": "Performance is moderate, with scores ranging from 41.72 to 63.18. It shows a steady decline as line counts increase.", + "global_insights": "Most models show a decline in performance as line counts increase, indicating a general sensitivity to larger code blocks. Nxcode-CQ-7B stands out as the most robust model across all subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle significantly, while larger models like Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show better but varying performance." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/QS/token_counts_recommendation.json b/llm_insight/HumanEval/5/QS/token_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..4e1861f3b592466a27e5125a1fd0fbe1d6ba1e41 --- /dev/null +++ b/llm_insight/HumanEval/5/QS/token_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High performance and robustness": [ + { + "Nxcode-CQ-7B": "Consistently high performance across all token subsets, making it reliable for varying input sizes." + }, + { + "deepseek_coder-6.7b-instruct": "Strong performance in most subsets, though it shows some inconsistency in the largest token subset." + } + ], + "Moderate performance with cost-effectiveness": [ + { + "codegemma-7b-it": "Balances performance and cost, though it declines with larger token counts." + }, + { + "deepseek_coder-33b-instruct": "Good performance in smaller token subsets and maintains a baseline in larger ones." + } + ], + "Low-cost with basic capabilities": [ + { + "codeqwen1.5-7b": "Moderate performance at a lower cost, suitable for less demanding tasks." + }, + { + "deepseek-coder-6.7b-base": "Basic performance at a lower cost, though it declines with larger token counts." 
+ } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/5/QS/token_counts_report.json b/llm_insight/HumanEval/5/QS/token_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..61b3bdc4108883f73470c2cfdd9379be7cb04621 --- /dev/null +++ b/llm_insight/HumanEval/5/QS/token_counts_report.json @@ -0,0 +1,15 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_3 (90.91) but has inconsistent results across other subsets, indicating potential instability with varying token counts.", + "Nxcode-CQ-7B": "Consistently high performance across all subsets, especially in token_subset_1 (95.45), suggesting robustness to token count variations.", + "codegemma-2b": "Poor performance across all subsets, with a significant drop in token_subset_5 (5.31), indicating it struggles with larger token counts.", + "codegemma-7b": "Moderate performance with a steady decline as token counts increase, suggesting limitations in handling larger inputs.", + "codegemma-7b-it": "Better than codegemma-7b but still shows a decline in performance with increasing token counts, though it maintains a baseline in token_subset_5 (39.22).", + "deepseek-coder-1.3b-base": "Low performance across all subsets, with a sharp drop in token_subset_4 (15.45) and token_subset_5 (10.94), indicating it is not suitable for larger token counts.", + "deepseek-coder-6.7b-base": "Moderate performance with a steady decline as token counts increase, similar to codegemma-7b.", + "deepseek_coder-6.7b-instruct": "Strong performance in token_subset_1 (87.88) and token_subset_4 (76.36), but a significant drop in token_subset_5 (51.72), indicating inconsistency.", + "deepseek_coder_33b-base": "Moderate performance with a steady decline as token counts increase, similar to other base models.", + "deepseek_coder_33b-instruct": "Strong performance in token_subset_1 (87.12) but declines steadily, though it maintains a baseline in token_subset_4 (64.7).", + "codeqwen1.5-7b": "Moderate performance with a steady decline as token counts increase, similar to other 7b models.", + "new_model": "Performance mirrors codegemma-7b-it, suggesting similar capabilities and limitations.", + "global_insights": "Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show the highest and most consistent performance across subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle significantly with larger token counts. There is a general trend of declining performance as token counts increase, with some models like Nxcode-CQ-7B being exceptions." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/6/EI/CC_recommendation.json b/llm_insight/HumanEval/6/EI/CC_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..840183301a3ae3f0f47a4e53484acdedc8fbb9fe --- /dev/null +++ b/llm_insight/HumanEval/6/EI/CC_recommendation.json @@ -0,0 +1,26 @@ +{ + "High accuracy tasks with tolerance for variability": [ + { + "CodeFuse-DeepSeek-33b": "Achieves perfect scores in subsets 4 and 5, making it ideal for tasks where these subsets are representative." + }, + { + "Nxcode-CQ-7B": "Excels in subsets 2 and 6, suitable for tasks prioritizing these data types." + } + ], + "Balanced performance and cost-effectiveness": [ + { + "deepseek_coder-6.7b-instruct": "Offers strong performance in multiple subsets without the extreme variability of larger models." 
+ }, + { + "codeqwen1.5-7b": "Provides good performance in subset 5 and moderate performance elsewhere, suitable for balanced tasks." + } + ], + "Low-resource environments": [ + { + "codegemma-7b-it": "Better performance than its non-IT counterpart, suitable for environments with limited computational resources." + }, + { + "deepseek-coder-6.7b-base": "Moderate performance at a lower computational cost compared to larger models." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/6/EI/CC_report.json b/llm_insight/HumanEval/6/EI/CC_report.json new file mode 100644 index 0000000000000000000000000000000000000000..2a515d3e3ebcbe5e51c40f81ee840cbb248e9916 --- /dev/null +++ b/llm_insight/HumanEval/6/EI/CC_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 4 and 5 with perfect scores, but has a significant drop in subset 3 (46.67). This indicates potential variability in handling different types of data within the CC perspective.", + "Nxcode-CQ-7B": "This model excels in subsets 2 and 6 (91.41 and 98.33) but performs poorly in subset 4 (20.0). The wide range of scores suggests sensitivity to specific data characteristics.", + "codegemma-2b": "Consistently low performance across all subsets, with the highest score being 31.63 in subset 1. This model may not be suitable for tasks requiring high accuracy.", + "codegemma-7b": "Moderate performance with peaks in subset 1 (46.84) and subset 5 (40.0). The model struggles in subset 4 (0.0), indicating a potential weakness in certain data types.", + "codegemma-7b-it": "Better than its non-IT counterpart, with a high of 57.35 in subset 1. However, it still shows significant drops in subsets 4 and 5 (0.0 and 5.0).", + "deepseek-coder-1.3b-base": "Very low performance across all subsets, with no scores above 38.57. Not recommended for high-accuracy tasks.", + "deepseek-coder-6.7b-base": "Moderate performance with a high of 51.68 in subset 1. Similar to other models, it fails in subset 4 (0.0).", + "deepseek_coder-6.7b-instruct": "Strong performance in subsets 1, 2, and 3 (73.16, 77.72, 56.67), but fails in subset 4 (0.0). This suggests good generalizability except for specific data types.", + "deepseek_coder_33b-base": "Variable performance with a high of 80.0 in subset 5 but 0.0 in subset 4. This inconsistency may limit its reliability.", + "deepseek_coder_33b-instruct": "Consistently good performance in subsets 1, 2, and 3 (67.86, 70.33, 53.0), but struggles in subset 4 (0.0). The model shows promise but has clear limitations.", + "codeqwen1.5-7b": "Strong performance in subset 5 (95.0) but fails in subset 4 (0.0). The model's performance is otherwise moderate, indicating potential for specific use cases.", + "global_insights": "1. Most models struggle with subset 4, indicating a potential outlier or particularly challenging data type within the CC perspective. 2. Larger models (e.g., 33b variants) generally perform better but are not immune to significant drops in specific subsets. 3. The Nxcode-CQ-7B and CodeFuse-DeepSeek-33b models show the highest peaks but also the most variability, suggesting a trade-off between peak performance and consistency." 
+} \ No newline at end of file diff --git a/llm_insight/HumanEval/6/EI/line_counts_recommendation.json b/llm_insight/HumanEval/6/EI/line_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..9f603b944ae2dc354714dfeb1fb99f18c8bc6742 --- /dev/null +++ b/llm_insight/HumanEval/6/EI/line_counts_recommendation.json @@ -0,0 +1,31 @@ +{ + "Small to Medium Line Counts (Cost-Effective)": [ + { + "Nxcode-CQ-7B": "Consistently high performance in subsets 1-4, making it reliable for smaller to medium line counts." + }, + { + "deepseek_coder-6.7b-instruct": "Strong performance in subsets 1-4, offering a good balance of cost and effectiveness." + } + ], + "Large Line Counts (High Performance)": [ + { + "CodeFuse-DeepSeek-33b": "Exceptional performance in subset 6, though inconsistent in others. Best for tasks focusing on large line counts." + } + ], + "General Purpose (Balanced)": [ + { + "codeqwen1.5-7b": "Moderate performance across most subsets, suitable for general use where line counts vary." + }, + { + "deepseek_coder_33b-instruct": "More consistent than its base version, offering a balance for varied line counts." + } + ], + "Avoid": [ + { + "codegemma-2b": "Poor performance across all subsets." + }, + { + "deepseek-coder-1.3b-base": "Consistently low scores, not suitable for any scenario." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/6/EI/line_counts_report.json b/llm_insight/HumanEval/6/EI/line_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..b06f8bd82d97730000530883c69ac21847b9068d --- /dev/null +++ b/llm_insight/HumanEval/6/EI/line_counts_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 1-3 and subset 6, but a significant drop in subset 4. This suggests it handles larger line counts well but struggles with medium-sized subsets.", + "Nxcode-CQ-7B": "Consistently high performance across subsets 1-4, with a notable drop in subsets 5-6. This indicates robustness for smaller to medium line counts but less effectiveness for larger ones.", + "codegemma-2b": "Poor performance across all subsets, with particularly low scores in subsets 4-6. Not suitable for tasks requiring handling of larger line counts.", + "codegemma-7b": "Moderate performance in subsets 1-3, but sharply declines in subsets 4-6. Better than codegemma-2b but still not reliable for larger line counts.", + "codegemma-7b-it": "Improved over codegemma-7b, especially in subsets 1-3, but still struggles with larger line counts. A middle-ground option among codegemma models.", + "deepseek-coder-1.3b-base": "Low performance across the board, with slight improvements in subset 6. Not recommended for any line count range.", + "deepseek-coder-6.7b-base": "Better than 1.3b but still inconsistent, with a sharp drop in subsets 4-6. Limited utility for larger line counts.", + "deepseek_coder-6.7b-instruct": "Strong in subsets 1-4, but performance halves in subsets 5-6. Good for smaller to medium line counts but not larger ones.", + "deepseek_coder_33b-base": "Variable performance, with high scores in subsets 1 and 6 but poor in subset 5. Unpredictable for consistent use.", + "deepseek_coder_33b-instruct": "Similar to 33b-base but slightly more consistent in subsets 1-4. Still not reliable for larger line counts.", + "codeqwen1.5-7b": "Moderate in subsets 1-3, but drops significantly in subsets 4-6. 
A balanced option but not for larger line counts.",
+    "global_insights": "Models generally perform better on smaller to medium line counts (subsets 1-3) and struggle with larger ones (subsets 4-6). Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are top performers but have specific weaknesses. Codegemma models are consistently weak, while deepseek models show variability. Instruct versions of models often outperform their base counterparts."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/6/EI/token_counts_recommendation.json b/llm_insight/HumanEval/6/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..df909a4b9ddf6b665f5efca8a9562efe7cd25a12
--- /dev/null
+++ b/llm_insight/HumanEval/6/EI/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance with moderate token counts": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across most subsets except the largest."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Robust performance for most token counts, though not the largest."
+        }
+    ],
+    "Cost-effective for small to medium tasks": [
+        {
+            "codegemma-7b-it": "Better performance than smaller codegemma models for mid-range token counts."
+        },
+        {
+            "deepseek-coder-6.7b-base": "Balanced performance and resource usage for smaller tasks."
+        }
+    ],
+    "Large and complex tasks": [
+        {
+            "CodeFuse-DeepSeek-33b": "Handles larger token counts better than most, though with some variability."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Good for large inputs but inconsistent in some subsets."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/6/EI/token_counts_report.json b/llm_insight/HumanEval/6/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5d1d2ea6b015d282038943c53d0ea476c5a3f73
--- /dev/null
+++ b/llm_insight/HumanEval/6/EI/token_counts_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets with higher token counts (token_subset_2 and token_subset_6), but struggles in token_subset_3 and token_subset_4. This suggests it may be better suited for larger code snippets or more complex tasks.",
+    "Nxcode-CQ-7B": "Consistently performs well across most subsets except token_subset_6, where it drops significantly. This indicates robustness in handling varying token counts but may have limitations with very large inputs.",
+    "codegemma-2b": "Performance degrades sharply as token count increases, making it unsuitable for larger or more complex code tasks.",
+    "codegemma-7b": "Similar to codegemma-2b but with slightly better performance across subsets. Still not ideal for larger inputs.",
+    "codegemma-7b-it": "Shows improvement over codegemma-7b, particularly in mid-range token subsets, but still fails in token_subset_6.",
+    "deepseek-coder-1.3b-base": "Struggles with larger token counts, performing poorly in subsets 3 through 6.",
+    "deepseek-coder-6.7b-base": "Better than the 1.3b version but still not suitable for larger inputs.",
+    "deepseek_coder-6.7b-instruct": "Performs well in subsets 1-5 but fails in token_subset_6. This suggests it is robust for most tasks but may not handle very large inputs well.",
+    "deepseek_coder_33b-base": "Shows moderate performance across subsets but struggles with larger inputs.",
+    "deepseek_coder_33b-instruct": "Performs well in subsets 1-2 and subset 5, but drops in subsets 3-4 and fails in subset 6. This indicates variability in handling different token counts.",
+    "codeqwen1.5-7b": "Moderate performance across subsets but struggles with larger inputs.",
+    "global_insights": "Models generally perform better with smaller token counts, with performance degrading as token count increases. Larger models (e.g., 33b variants) tend to handle larger inputs better but still have limitations. The Nxcode-CQ-7B and deepseek_coder-6.7b-instruct models show the most consistent performance across varying token counts, though they still struggle with the largest inputs."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/6/QS/CC_recommendation.json b/llm_insight/HumanEval/6/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..96c5a0523d64c026f98dea1498bc0426c3b14441
--- /dev/null
+++ b/llm_insight/HumanEval/6/QS/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High accuracy and reliability": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across all subsets, making it the most reliable choice for critical applications."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Strong performance in most subsets, though with some instability in subset 6."
+        }
+    ],
+    "Moderate accuracy with cost-effectiveness": [
+        {
+            "deepseek_coder-6.7b-instruct": "Good performance in most subsets, offering a balance between cost and accuracy."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Solid performance across most subsets, suitable for less critical tasks."
+        }
+    ],
+    "Low-cost with basic performance": [
+        {
+            "codegemma-7b-it": "Better than its non-IT counterpart, suitable for basic tasks where high accuracy is not required."
+        },
+        {
+            "deepseek-coder-6.7b-base": "Moderate performance, suitable for non-critical applications."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/6/QS/CC_report.json b/llm_insight/HumanEval/6/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..aa042a6015352ea9aafef2e78f99ca92f050c864
--- /dev/null
+++ b/llm_insight/HumanEval/6/QS/CC_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 2, 3, and 5, but struggles in subset 6 with a significant drop to 50.0. This indicates potential instability in handling certain types of data within the CC perspective.",
+    "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the lowest score being 80.18 in subset 3. This model demonstrates robustness and reliability across different data splits.",
+    "codegemma-2b": "Poor performance across all subsets, particularly in subset 6 with a score of 2.29. This model is not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Moderate performance with scores ranging from 14.37 to 61.61. The model shows some capability but is inconsistent, especially in subset 6.",
+    "codegemma-7b-it": "Better than its non-IT counterpart, with scores ranging from 33.33 to 65.36. Still, it lacks the consistency needed for critical applications.",
+    "deepseek-coder-1.3b-base": "Low performance across the board, with a significant drop in subset 6 to 6.67. Not recommended for any serious tasks.",
+    "deepseek-coder-6.7b-base": "Moderate performance, with scores between 13.75 and 69.64. Shows some promise but is inconsistent.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in subsets 1, 4, and 5, with a notable drop in subset 6 to 54.58. This model is reliable but has some limitations.",
+    "deepseek_coder_33b-base": "Moderate to good performance, with scores ranging from 28.96 to 70.71. Shows potential but is not the top performer.",
+    "deepseek_coder_33b-instruct": "Good performance across most subsets, with scores between 45.42 and 80.18. A solid choice but not the best.",
+    "codeqwen1.5-7b": "Moderate performance, with scores ranging from 29.58 to 64.64. Inconsistent and not recommended for high-stakes tasks.",
+    "global_insights": "Nxcode-CQ-7B is the top performer, showing consistent high scores across all subsets. CodeFuse-DeepSeek-33b and deepseek_coder-6.7b-instruct also perform well but have notable drops in subset 6. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly, indicating that model size and architecture play a significant role in performance. The CC perspective seems to favor larger, more robust models."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/6/QS/line_counts_recommendation.json b/llm_insight/HumanEval/6/QS/line_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..f2b8af8f3ae69ffeb615aa6240e317597135d2db
--- /dev/null
+++ b/llm_insight/HumanEval/6/QS/line_counts_recommendation.json
@@ -0,0 +1,34 @@
+{
+    "High accuracy tasks with variable line counts": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across most subsets."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Strong in subsets with higher line counts."
+        }
+    ],
+    "Cost-effective for moderate line counts": [
+        {
+            "CodeFuse-DeepSeek-33b": "Balanced performance and cost for moderate line counts."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance at a lower cost."
+        }
+    ],
+    "Tasks with smaller line counts": [
+        {
+            "deepseek-coder-6.7b-base": "Suitable for smaller line counts at a lower cost."
+        },
+        {
+            "deepseek_coder_33b-base": "Better performance for small to medium line counts."
+        }
+    ],
+    "Not recommended for any tasks": [
+        {
+            "codegemma-2b": "Poor performance across all subsets."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Low accuracy in all scenarios."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/6/QS/line_counts_report.json b/llm_insight/HumanEval/6/QS/line_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..8effaf3c3bf337bf9b30f6acb8e17022bbbb7527
--- /dev/null
+++ b/llm_insight/HumanEval/6/QS/line_counts_report.json
@@ -0,0 +1,15 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows consistent performance across subsets, with a slight dip in subset_4. It performs well in subsets with moderate line counts.",
+    "Nxcode-CQ-7B": "This model excels in most subsets, particularly in subset_1 and subset_2, but shows a noticeable drop in subset_6. It is robust for larger line counts.",
+    "codegemma-2b": "Performance is poor across all subsets, with the lowest scores in subset_4 and subset_6. Not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Better than codegemma-2b but still underperforms in subsets with higher line counts. Moderate performance overall.",
+    "codegemma-7b-it": "Improved over codegemma-7b, especially in subset_1 and subset_5. Still struggles with higher line counts.",
+    "deepseek-coder-1.3b-base": "Low performance across all subsets, particularly in subset_4 and subset_6. Not recommended for complex tasks.",
+    "deepseek-coder-6.7b-base": "Moderate performance in subset_1 and subset_2, but drops significantly in subset_6. Suitable for smaller line counts.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in subset_2, but inconsistent in others. Good for tasks with variable line counts.",
+    "deepseek_coder_33b-base": "Decent performance in subset_1, but declines as line counts increase. Best for smaller to medium line counts.",
+    "deepseek_coder_33b-instruct": "High performance in subset_1 and subset_2, but drops in subset_6. Suitable for tasks with moderate line counts.",
+    "codeqwen1.5-7b": "Moderate performance across subsets, with a dip in subset_4. Best for tasks with consistent line counts.",
+    "new": "Similar to codegemma-7b-it, with better performance in subset_1 and subset_5. Still struggles with higher line counts.",
+    "global_insights": "Models like Nxcode-CQ-7B and deepseek_coder-6.7b-instruct perform well in subsets with higher line counts, while smaller models like codegemma-2b struggle. Larger models generally handle variability better, but cost-effectiveness must be considered."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/6/QS/token_counts_recommendation.json b/llm_insight/HumanEval/6/QS/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..f408e8ee729a22520839e3c98c0d0434fb6bd227
--- /dev/null
+++ b/llm_insight/HumanEval/6/QS/token_counts_recommendation.json
@@ -0,0 +1,23 @@
+{
+    "High accuracy and robustness across token counts": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across all token subsets, making it reliable for varied token counts."
+        }
+    ],
+    "Balanced performance and cost-effectiveness for low to medium token tasks": [
+        {
+            "deepseek_coder-6.7b-instruct": "Strong performance in lower token subsets and moderate in medium, suitable for cost-effective solutions."
+        },
+        {
+            "deepseek_coder_33b-instruct": "High performance in low token subsets, good for tasks where token count is controlled."
+        }
+    ],
+    "Budget-friendly for low token tasks": [
+        {
+            "codegemma-7b-it": "Moderate performance in low token subsets at a lower cost compared to larger models."
+        },
+        {
+            "codeqwen1.5-7b": "Decent performance in low token subsets, suitable for budget constraints."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/6/QS/token_counts_report.json b/llm_insight/HumanEval/6/QS/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..49cde9bb0a9a5108cd94712298464f0ce189b329
--- /dev/null
+++ b/llm_insight/HumanEval/6/QS/token_counts_report.json
@@ -0,0 +1,15 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows high variability in performance across token subsets, peaking at 96.43% in token_subset_3 but dropping to 62.5% in token_subset_6. This suggests sensitivity to token count variations.",
+    "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the lowest score at 77.08% in token_subset_6. Demonstrates robustness to token count changes.",
+    "codegemma-2b": "Poor performance across all subsets, especially in token_subset_6 with only 1.25%. Not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Moderate performance with significant drops in higher token subsets. Performance degrades as token count increases.",
+    "codegemma-7b-it": "Better than codegemma-7b but still shows a declining trend with increasing token counts. Best performance in token_subset_1 at 75.89%.",
+    "deepseek-coder-1.3b-base": "Low performance across all subsets, with a sharp decline in token_subset_6 to 5.62%. Not recommended for high-token tasks.",
+    "deepseek-coder-6.7b-base": "Moderate performance with a steady decline as token count increases. Best in token_subset_1 at 73.04%.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in lower token subsets (85.71% in token_subset_1) but drops to 46.67% in token_subset_6. Good for low to medium token tasks.",
+    "deepseek_coder_33b-base": "Variable performance, peaking at 73.57% in token_subset_1 and dropping to 25.42% in token_subset_6. Sensitive to token count.",
+    "deepseek_coder_33b-instruct": "High performance in lower token subsets (86.25% in token_subset_1) but declines to 39.58% in token_subset_6. Suitable for low to medium token tasks.",
+    "codeqwen1.5-7b": "Moderate performance with a steady decline as token count increases. Best in token_subset_1 at 69.11%.",
+    "new": "Similar to codegemma-7b-it, with best performance in token_subset_1 at 75.89% and declining to 36.25% in token_subset_6.",
+    "global_insights": "Models generally perform better in lower token subsets, with performance degrading as token count increases. Nxcode-CQ-7B is the most robust across all subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly, especially in high-token subsets. Instruct models (e.g., deepseek_coder-6.7b-instruct) show better performance than their base counterparts but still decline with higher token counts."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/EI/CC_recommendation.json b/llm_insight/HumanEval/7/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..95f3407de5b9d66bd44820f67a9dc4dd74a4bb83
--- /dev/null
+++ b/llm_insight/HumanEval/7/EI/CC_recommendation.json
@@ -0,0 +1,34 @@
+{
+    "High accuracy tasks": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across most subsets, especially in subset 7 (98.33%)."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Strong performance in subsets 5 and 6 with 100% accuracy."
+        }
+    ],
+    "Moderate accuracy tasks with cost-effectiveness": [
+        {
+            "deepseek_coder_33b-instruct": "Balanced performance across most subsets with reasonable accuracy."
+        },
+        {
+            "codeqwen1.5-7b": "High variability but shows potential in specific subsets like subset 6 (95.0%)."
+        }
+    ],
+    "Low priority tasks": [
+        {
+            "codegemma-7b-it": "Moderate performance in some subsets but fails in others."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Good performance in certain subsets but inconsistent overall."
+        }
+    ],
+    "Not recommended": [
+        {
+            "codegemma-2b": "Poor performance across all subsets."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Very poor performance, especially in critical subsets."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/EI/CC_report.json b/llm_insight/HumanEval/7/EI/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..acbd780d53ab5a1a9c90b84c58b1f45e9e5f07ee
--- /dev/null
+++ b/llm_insight/HumanEval/7/EI/CC_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 5 and 6 with 100% accuracy but struggles in subset 4 with only 33.33%. This indicates potential overfitting or dataset bias in certain subsets.",
+    "Nxcode-CQ-7B": "Consistently high performance across most subsets, especially in subset 7 (98.33%). However, it performs poorly in subset 5 (20.0%), suggesting a potential weakness in handling specific data characteristics.",
+    "codegemma-2b": "Poor performance across all subsets, with the highest score being 38.63% in subset 1. This model is not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Moderate performance in subsets 1 and 2 but drops significantly in others. The model fails completely in subset 5 (0.0%).",
+    "codegemma-7b-it": "Better than its 7b counterpart, especially in subsets 1, 2, and 3. However, it still struggles in subset 5 (0.0%).",
+    "deepseek-coder-1.3b-base": "Very poor performance, especially in subsets 5, 6, and 7 (0.0%). Not recommended for any serious tasks.",
+    "deepseek-coder-6.7b-base": "Slightly better than the 1.3b version but still underperforms in subsets 5, 6, and 7.",
+    "deepseek_coder-6.7b-instruct": "Good performance in subsets 1, 2, and 3 but fails in subset 5 (0.0%). Shows potential for specific tasks.",
+    "deepseek_coder_33b-base": "Mixed performance, with a high score in subset 6 (80.0%) but fails in subset 5 (0.0%).",
+    "deepseek_coder_33b-instruct": "Consistently moderate to high performance across most subsets except subset 5 (0.0%).",
+    "codeqwen1.5-7b": "High variability in performance, with a peak in subset 6 (95.0%) but fails in subset 5 (0.0%).",
+    "global_insights": "1. Subset 5 is particularly challenging for most models, with many scoring 0.0%. 2. Larger models (e.g., 33b) generally perform better than smaller ones. 3. Instruct-tuned models show better performance than their base counterparts. 4. There is significant variability in model performance across different subsets, indicating the importance of dataset diversity."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/EI/line_counts_recommendation.json b/llm_insight/HumanEval/7/EI/line_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..a287de7afeb621dd8e0a6ea56a0fb6fa5914cf40
--- /dev/null
+++ b/llm_insight/HumanEval/7/EI/line_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance with large line counts": [
+        {
+            "CodeFuse-DeepSeek-33b": "Excels in larger line count subsets (line_subset_5 and line_subset_7) with perfect scores."
+        },
+        {
+            "Nxcode-CQ-7B": "Strong performance in line_subset_7 and generally robust across most subsets."
+        }
+    ],
+    "Balanced performance for general use": [
+        {
+            "deepseek_coder-6.7b-instruct": "Good performance in line_subset_1 and line_subset_2, suitable for tasks with moderate line counts."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance across all subsets, making it a versatile choice for general tasks."
+        }
+    ],
+    "Cost-effective for smaller code segments": [
+        {
+            "codegemma-7b-it": "Better performance than base codegemma models for smaller line counts (line_subset_1 and line_subset_3)."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Low cost but limited to very small code segments where performance is acceptable."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/EI/line_counts_report.json b/llm_insight/HumanEval/7/EI/line_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..8d75764705ec36d310b0908ea95f9790f4a42e6e
--- /dev/null
+++ b/llm_insight/HumanEval/7/EI/line_counts_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance across most subsets, particularly excelling in line_subset_5 and line_subset_7 with perfect scores. However, it has a noticeable dip in line_subset_3 and line_subset_6, suggesting potential variability in handling certain line count ranges.",
+    "Nxcode-CQ-7B": "This model demonstrates robust performance in line_subset_1 and line_subset_2 but shows inconsistency in line_subset_5, where performance drops significantly. It performs well in line_subset_7, indicating strength in larger line counts.",
+    "codegemma-2b": "The model struggles across all subsets, with particularly poor performance in line_subset_6 and line_subset_7. This suggests it may not be suitable for tasks requiring handling of larger or more complex code segments.",
+    "codegemma-7b": "Performance is mediocre, with some improvement over codegemma-2b but still lacking in subsets like line_subset_5 and line_subset_6. It shows slight improvement in line_subset_7 compared to its smaller counterpart.",
+    "codegemma-7b-it": "This model shows better performance than the base codegemma versions, particularly in line_subset_1 and line_subset_3. However, it still struggles with line_subset_5 and line_subset_6.",
+    "deepseek-coder-1.3b-base": "Performance is generally weak, with significant drops in line_subset_5 and line_subset_6. It shows some capability in line_subset_7 but remains inconsistent.",
+    "deepseek-coder-6.7b-base": "The model has variable performance, with a complete failure in line_subset_5. It performs moderately in other subsets but lacks consistency.",
+    "deepseek_coder-6.7b-instruct": "This model shows strong performance in line_subset_1 and line_subset_2 but declines in subsets with higher line counts, particularly line_subset_5 and line_subset_6.",
+    "deepseek_coder_33b-base": "Performance is inconsistent, with strong showings in line_subset_1 but significant drops in line_subset_5 and line_subset_6. It recovers somewhat in line_subset_7.",
+    "deepseek_coder_33b-instruct": "The model performs well in line_subset_1 and line_subset_4 but fails completely in line_subset_5. It shows moderate performance in other subsets.",
+    "codeqwen1.5-7b": "Performance is middling across all subsets, with no standout strengths or weaknesses. It handles line_subset_7 better than some other models but remains inconsistent.",
+    "global_insights": "Models like CodeFuse-DeepSeek-33b and Nxcode-CQ-7B generally perform better across subsets, especially in handling larger line counts. Smaller models (e.g., codegemma-2b, deepseek-coder-1.3b-base) struggle significantly, particularly with more complex or lengthy code segments. Instruct-tuned models (e.g., deepseek_coder-6.7b-instruct) show improved performance in certain subsets but still exhibit variability. The dataset division by line counts reveals that model performance can be highly dependent on the complexity and size of the code segments being evaluated."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/EI/token_counts_recommendation.json b/llm_insight/HumanEval/7/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..56be9eafaa8ea726c08380c631b70eb3f13934cd
--- /dev/null
+++ b/llm_insight/HumanEval/7/EI/token_counts_recommendation.json
@@ -0,0 +1,34 @@
+{
+    "High performance with cost-effectiveness": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across most subsets, making it a reliable choice for varied token counts."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Strong performance in lower token subsets and moderate in higher, offering a good balance."
+        }
+    ],
+    "Budget-conscious with moderate performance": [
+        {
+            "codegemma-7b-it": "Better performance than base models at a lower cost, suitable for tasks with lower token counts."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Good performance in lower token subsets, though inconsistent in higher counts."
+        }
+    ],
+    "High performance regardless of cost": [
+        {
+            "CodeFuse-DeepSeek-33b": "Excels in mid-range token subsets and achieves perfect score in subset 7, ideal for high-stakes tasks."
+        },
+        {
+            "Nxcode-CQ-7B": "Robust performance across most subsets, suitable for critical applications."
+        }
+    ],
+    "Not recommended": [
+        {
+            "codegemma-2b": "Poor performance across all subsets, not suitable for any token count."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Low performance across all subsets, not recommended for any task."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/EI/token_counts_report.json b/llm_insight/HumanEval/7/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..6efd757abf59e20711939d26eca8c73c29331825
--- /dev/null
+++ b/llm_insight/HumanEval/7/EI/token_counts_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 2 and 7 with 84.51 and 100.0 respectively, but has a significant drop in subset 6 with 0. This indicates variability in performance based on token counts, excelling in mid-range token subsets but failing in subset 6.",
+    "Nxcode-CQ-7B": "Consistently high performance across most subsets (94.05 in subset 1, 86.48 in subset 2), but drops to 0 in subset 6 and 20.0 in subset 7. This suggests robustness in handling various token counts except for extreme cases.",
+    "codegemma-2b": "Poor performance across all subsets, with the highest score being 42.98 in subset 1. This model struggles with all token counts, indicating it may not be suitable for tasks requiring varied token lengths.",
+    "codegemma-7b": "Better than codegemma-2b but still underperforms, with the highest score of 54.52 in subset 1. Performance degrades as token counts increase, suggesting limitations in handling larger token subsets.",
+    "codegemma-7b-it": "Improved over codegemma-7b, with the highest score of 69.76 in subset 1. However, performance still declines with increasing token counts, indicating limited scalability.",
+    "deepseek-coder-1.3b-base": "Low performance across all subsets, peaking at 48.33 in subset 1. This model is not recommended for tasks involving varied token counts.",
+    "deepseek-coder-6.7b-base": "Moderate performance, peaking at 67.86 in subset 1. Performance drops significantly with higher token counts, similar to other base models.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in subsets 1 and 2 (86.31 and 70.28), but drops in higher token subsets. This model is more robust than base versions but still has limitations.",
+    "deepseek_coder_33b-base": "Moderate performance, with the highest score of 68.21 in subset 1. Performance declines with higher token counts, indicating scalability issues.",
+    "deepseek_coder_33b-instruct": "Good performance in subsets 1 and 2 (82.74 and 67.61), but inconsistent in higher token subsets. This model shows promise but is not consistently reliable.",
+    "codeqwen1.5-7b": "Moderate performance, peaking at 65.6 in subset 1. Performance is inconsistent across subsets, suggesting limited reliability for varied token counts.",
+    "global_insights": "Models generally perform better in lower token count subsets (1-3) and degrade in higher token counts. Instruct models tend to outperform base models. Subset 6 shows universally poor performance, indicating a potential outlier or particularly challenging token range. CodeFuse-DeepSeek-33b and Nxcode-CQ-7B show the most robust performance across subsets, though with notable exceptions."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/QS/CC_recommendation.json b/llm_insight/HumanEval/7/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..b58a06c60a287af6d33796a89259ed84d9f16feb
--- /dev/null
+++ b/llm_insight/HumanEval/7/QS/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High accuracy and reliability": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across all subsets."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Strong and reliable performance with minimal drops."
+        }
+    ],
+    "Moderate accuracy with cost-effectiveness": [
+        {
+            "deepseek_coder_33b-instruct": "Good performance with reasonable consistency."
+        },
+        {
+            "codegemma-7b-it": "Better than base versions but still cost-effective."
+        }
+    ],
+    "Low-cost solutions for less critical tasks": [
+        {
+            "deepseek-coder-6.7b-base": "Moderate performance at a lower cost."
+        },
+        {
+            "codeqwen1.5-7b": "Decent performance for less demanding tasks."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/QS/CC_report.json b/llm_insight/HumanEval/7/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..9935bc55310bada2489d96f61a9c680c0ef7b566
--- /dev/null
+++ b/llm_insight/HumanEval/7/QS/CC_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows high variability in performance across subsets, with scores ranging from 50.0 to 92.0. It performs exceptionally well in CC_subset_6 but poorly in CC_subset_7.",
+    "Nxcode-CQ-7B": "Consistently high performance across all subsets, with scores mostly above 85.65. The model is robust and reliable, with minimal performance drops.",
+    "codegemma-2b": "Poor performance across all subsets, with scores as low as 2.29 in CC_subset_7. Not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Moderate performance, with scores ranging from 14.37 to 57.61. Shows some consistency but struggles in more challenging subsets.",
+    "codegemma-7b-it": "Better than codegemma-7b, with scores ranging from 33.33 to 62.17. Still, it has significant drops in performance in harder subsets.",
+    "deepseek-coder-1.3b-base": "Low to moderate performance, with scores ranging from 6.67 to 54.13. Not recommended for high-stakes tasks.",
+    "deepseek-coder-6.7b-base": "Moderate performance, with scores ranging from 13.75 to 65.0. Shows some promise but inconsistent.",
+    "deepseek_coder-6.7b-instruct": "Strong performance, with scores ranging from 54.58 to 82.39. Reliable but has some variability.",
+    "deepseek_coder_33b-base": "Moderate to good performance, with scores ranging from 28.96 to 68.04. Shows potential but not the best.",
+    "deepseek_coder_33b-instruct": "Good performance, with scores ranging from 45.42 to 76.09. More consistent than the base version.",
+    "codeqwen1.5-7b": "Moderate performance, with scores ranging from 29.58 to 64.78. Inconsistent across subsets.",
+    "global_insights": "Nxcode-CQ-7B and deepseek_coder-6.7b-instruct are the top performers, showing high and consistent scores across subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle significantly. Performance variability is high for most models, indicating that subset difficulty varies."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/QS/line_counts_recommendation.json b/llm_insight/HumanEval/7/QS/line_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d8b4d1e157875c227ca3309dece9e7fc0c97080
--- /dev/null
+++ b/llm_insight/HumanEval/7/QS/line_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High accuracy tasks with moderate line counts": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across most subsets, making it reliable for high accuracy tasks."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Strong performance in most subsets, suitable for tasks requiring high accuracy."
+        }
+    ],
+    "Cost-effective for simpler tasks": [
+        {
+            "codegemma-7b-it": "Balances cost and performance for simpler tasks with lower line counts."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Good performance for medium complexity tasks at a lower cost than larger models."
+        }
+    ],
+    "Tasks with very low line counts": [
+        {
+            "codeqwen1.5-7b": "Moderate performance for simpler tasks with very low line counts."
+        },
+        {
+            "deepseek-coder-6.7b-base": "Suitable for tasks with low to medium line counts where cost is a concern."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/QS/line_counts_report.json b/llm_insight/HumanEval/7/QS/line_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c4a57e014a73b6e16c0bcca0b3d6b142ac7772a
--- /dev/null
+++ b/llm_insight/HumanEval/7/QS/line_counts_report.json
@@ -0,0 +1,15 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows consistent performance across subsets, with a slight dip in line_subset_2 and line_subset_5. It maintains relatively high accuracy, indicating robustness in handling varying line counts.",
+    "Nxcode-CQ-7B": "This model demonstrates strong performance in most subsets, particularly excelling in line_subset_1 and line_subset_5. However, there is a noticeable drop in line_subset_7, suggesting potential limitations with larger line counts.",
+    "codegemma-2b": "Performance is consistently low across all subsets, with a significant decline as line counts increase. This model is not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "While better than codegemma-2b, this model still struggles with higher line counts, showing a steady decline in performance.",
+    "codegemma-7b-it": "This model performs better than its non-IT counterpart, but still shows a decline with increasing line counts. It may be suitable for simpler tasks.",
+    "deepseek-coder-1.3b-base": "Low performance across the board, with significant drops in subsets with higher line counts. Not recommended for complex tasks.",
+    "deepseek-coder-6.7b-base": "Moderate performance, but declines noticeably with larger line counts. May be suitable for medium complexity tasks.",
+    "deepseek_coder-6.7b-instruct": "Shows strong performance in subsets with lower to medium line counts, but drops significantly in line_subset_7. Good for tasks with moderate line counts.",
+    "deepseek_coder_33b-base": "Performance is decent but declines with larger line counts. Suitable for tasks with medium complexity.",
+    "deepseek_coder_33b-instruct": "One of the top performers, maintaining high accuracy across most subsets. However, there is a noticeable drop in line_subset_6 and line_subset_7.",
+    "codeqwen1.5-7b": "Moderate performance with a steady decline as line counts increase. May be suitable for simpler tasks.",
+    "new": "Similar to codegemma-7b-it, this model shows moderate performance but declines with larger line counts.",
+    "global_insights": "Models with larger parameter sizes (e.g., 33b) generally perform better across subsets. Performance tends to decline with increasing line counts, indicating that line count is a significant factor in model accuracy. Nxcode-CQ-7B and deepseek_coder_33b-instruct are the top performers, while codegemma-2b and deepseek-coder-1.3b-base are the weakest. The instruct variants generally outperform their base counterparts."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/QS/token_counts_recommendation.json b/llm_insight/HumanEval/7/QS/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c3d36969645af39c60ef7ed8a4052ee09301714
--- /dev/null
+++ b/llm_insight/HumanEval/7/QS/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High-performance, cost-effective for varied token tasks": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across all token subsets, making it the most reliable choice."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Balanced performance across most subsets, suitable for tasks with moderate token counts."
+        }
+    ],
+    "Moderate-performance, budget-friendly for low to moderate token tasks": [
+        {
+            "deepseek_coder-6.7b-instruct": "Good performance in low to moderate token subsets, offering a cost-effective solution."
+        },
+        {
+            "codegemma-7b-it": "Decent performance in low-token subsets, suitable for budget-conscious projects."
+        }
+    ],
+    "Not recommended for high-token tasks": [
+        {
+            "codegemma-2b": "Poor performance across all subsets, especially high-token ones."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Weak performance, particularly in high-token subsets."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/7/QS/token_counts_report.json b/llm_insight/HumanEval/7/QS/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..a4aec3c71fc34304f4ea9c55cdbe9555c520d4b5
--- /dev/null
+++ b/llm_insight/HumanEval/7/QS/token_counts_report.json
@@ -0,0 +1,15 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets with moderate token counts (subsets 2-5) but declines in subsets with very low or high token counts. This suggests it may struggle with extreme token distributions.",
+    "Nxcode-CQ-7B": "Consistently high performance across all subsets, indicating robustness to varying token counts. It outperforms most models, especially in high-token subsets.",
+    "codegemma-2b": "Poor performance across all subsets, particularly in high-token subsets where it nearly fails. Not suitable for tasks with varying token counts.",
+    "codegemma-7b": "Moderate performance but declines significantly in high-token subsets. Better than codegemma-2b but still not reliable for high-token tasks.",
+    "codegemma-7b-it": "Improved over codegemma-7b but still shows significant drops in high-token subsets. May be suitable for low to moderate token tasks.",
+    "deepseek-coder-1.3b-base": "Weak performance, especially in high-token subsets. Not recommended for tasks with high token counts.",
+    "deepseek-coder-6.7b-base": "Moderate performance but inconsistent across subsets. Shows some resilience in low-token subsets but struggles with high tokens.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in low to moderate token subsets but declines in high-token subsets. Suitable for tasks with controlled token counts.",
+    "deepseek_coder_33b-base": "Moderate performance with a noticeable drop in high-token subsets. Better than smaller models but not as robust as Nxcode-CQ-7B.",
+    "deepseek_coder_33b-instruct": "Good performance across most subsets but still declines in high-token subsets. A balanced choice for varied token tasks.",
+    "codeqwen1.5-7b": "Moderate performance with significant drops in high-token subsets. Similar to deepseek-coder-6.7b-base but slightly better in some subsets.",
+    "new": "Identical performance to codegemma-7b-it, suggesting it may be a variant or duplicate entry. Performance insights mirror those of codegemma-7b-it.",
+    "global_insights": "Nxcode-CQ-7B is the most robust model across all token subsets. Smaller models (e.g., codegemma-2b, deepseek-coder-1.3b-base) struggle significantly with high token counts. Instruction-tuned models (e.g., deepseek_coder-6.7b-instruct) generally perform better than their base counterparts. Token count extremes (very low or high) are challenging for most models, indicating a need for token-aware preprocessing or model selection."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/8/EI/CC_recommendation.json b/llm_insight/HumanEval/8/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee06bb4472b783d9bc71d82339cbc026965fa08f
--- /dev/null
+++ b/llm_insight/HumanEval/8/EI/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High accuracy and robustness": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across most subsets, making it reliable for a wide range of tasks."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Balanced performance with moderate to high accuracy across most subsets."
+        }
+    ],
+    "Specialized tasks": [
+        {
+            "CodeFuse-DeepSeek-33b": "Excels in specific subsets, making it suitable for specialized tasks where it performs well."
+        },
+        {
+            "codeqwen1.5-7b": "High performance in subset 7, indicating potential for specific task types."
+        }
+    ],
+    "Cost-effective for moderate tasks": [
+        {
+            "codegemma-7b-it": "Moderate performance at a potentially lower cost compared to larger models."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Good performance in certain subsets, offering a balance between cost and capability."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/8/EI/CC_report.json b/llm_insight/HumanEval/8/EI/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..5c21041c1161c5e1efc2267232f3791db49ec119
--- /dev/null
+++ b/llm_insight/HumanEval/8/EI/CC_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows high variability in performance across subsets, excelling in subsets 6 and 8 but performing poorly in subset 5. This suggests the model may be highly specialized for certain types of tasks.",
+    "Nxcode-CQ-7B": "Consistently high performance across most subsets, except for a significant drop in subset 6. This indicates robustness but with a notable weakness in specific scenarios.",
+    "codegemma-2b": "Poor performance across all subsets, indicating it may not be suitable for tasks requiring high accuracy or complexity.",
+    "codegemma-7b": "Better than codegemma-2b but still underperforms in most subsets, suggesting limited capability for complex tasks.",
+    "codegemma-7b-it": "Moderate performance with some variability, showing potential in subsets 5 and 8 but failing in subset 6.",
+    "deepseek-coder-1.3b-base": "Very low performance across all subsets, indicating it is not suitable for most tasks.",
+    "deepseek-coder-6.7b-base": "Slightly better than the 1.3b version but still underperforms in most subsets.",
+    "deepseek_coder-6.7b-instruct": "Good performance in subsets 1, 2, and 3 but drops significantly in others, indicating inconsistency.",
+    "deepseek_coder_33b-base": "Moderate performance with some variability, showing potential in subsets 1, 2, and 7 but failing in subset 6.",
+    "deepseek_coder_33b-instruct": "Consistently moderate to high performance across most subsets, except for a significant drop in subset 6.",
+    "codeqwen1.5-7b": "High variability, excelling in subset 7 but performing poorly in others, indicating specialization in certain tasks.",
+    "global_insights": "Models with larger parameter sizes generally perform better, but there are exceptions. Subset 6 is particularly challenging for most models, indicating it may contain unique or complex tasks. The Nxcode-CQ-7B and deepseek_coder_33b-instruct models show the most consistent performance across subsets."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/8/EI/line_counts_recommendation.json b/llm_insight/HumanEval/8/EI/line_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..3abfbb5b8602e7ebe416ddae1f49493e0d1ad4f0
--- /dev/null
+++ b/llm_insight/HumanEval/8/EI/line_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High accuracy tasks": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across most subsets, making it reliable for high accuracy requirements."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Excels in several subsets, though with some variability, suitable for tasks where peak performance is needed."
+        }
+    ],
+    "Cost-effective tasks": [
+        {
+            "deepseek_coder-6.7b-instruct": "Balances performance and cost, offering decent accuracy without the highest resource demands."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance at a potentially lower cost, suitable for budget-conscious scenarios."
+        }
+    ],
+    "Instruction-tuned models": [
+        {
+            "deepseek_coder_33b-instruct": "Strong performance in key subsets, benefiting from instruction tuning."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Good balance of performance and cost, with the added benefit of instruction tuning."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/8/EI/line_counts_report.json b/llm_insight/HumanEval/8/EI/line_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..d628c98cac7005f953b1525873b70b9f760a5661
--- /dev/null
+++ b/llm_insight/HumanEval/8/EI/line_counts_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows high variability in performance across subsets, excelling in subsets 2, 4, 6, and 8 with scores above 80%, but struggling in subsets 5 and 7 with scores as low as 50%. This suggests the model may be sensitive to certain types of data or contexts.",
+    "Nxcode-CQ-7B": "Consistently performs well across most subsets, with scores generally above 80%. The notable exception is subset 6 with a score of 48.75%, indicating a potential weakness in handling specific data characteristics present in that subset.",
+    "codegemma-2b": "Poor performance across all subsets, with scores mostly below 30%. This model is not recommended for tasks requiring high accuracy.",
+    "codegemma-7b": "Better than codegemma-2b but still underperforms, with scores mostly below 50%. Shows some improvement in subsets 4 and 8, but overall performance is lacking.",
+    "codegemma-7b-it": "Moderate performance with scores ranging from 10% to 72.24%. Shows potential in subsets 1, 2, 3, and 4 but struggles in others.",
+    "deepseek-coder-1.3b-base": "Similar to codegemma-2b, with very low scores across most subsets. Not suitable for high-accuracy tasks.",
+    "deepseek-coder-6.7b-base": "Better than its 1.3b counterpart but still inconsistent, with scores ranging from 0% to 73.28%. Shows some promise in subsets 1 and 4.",
+    "deepseek_coder-6.7b-instruct": "Relatively stable performance with scores mostly above 50%. Excels in subsets 1, 2, and 3, but drops significantly in subset 6.",
+    "deepseek_coder_33b-base": "Inconsistent performance with highs and lows across subsets. Shows potential in subsets 1, 4, and 8 but fails in subset 7.",
+    "deepseek_coder_33b-instruct": "Strong performance in subsets 1, 2, and 4, but inconsistent in others. The drop in subset 7 to 2.5% is concerning.",
+    "codeqwen1.5-7b": "Moderate performance with scores mostly between 30% and 63.97%. Shows some consistency but lacks standout performance.",
+    "global_insights": "The Nxcode-CQ-7B and CodeFuse-DeepSeek-33b models show the highest overall performance, though with some variability. Smaller models like codegemma-2b and deepseek-coder-1.3b-base consistently underperform. The instruct variants of deepseek models generally perform better than their base counterparts, indicating the value of instruction tuning."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/8/EI/token_counts_recommendation.json b/llm_insight/HumanEval/8/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..7ec5c240b3818ca4a482d8931d207a358ec3d911
--- /dev/null
+++ b/llm_insight/HumanEval/8/EI/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance with controlled token counts": [
+        {
+            "deepseek_coder_33b-instruct": "Excels in lower token subsets, making it ideal for tasks where token counts can be kept low."
+        },
+        {
+            "Nxcode-CQ-7B": "Consistently strong performance across most subsets, except for extreme token counts."
+        }
+    ],
+    "Balanced performance and cost-effectiveness": [
+        {
+            "deepseek_coder-6.7b-instruct": "Offers a good balance of performance and cost, suitable for tasks with moderate token counts."
+        },
+        {
+            "codeqwen1.5-7b": "Provides moderate performance at a potentially lower cost, suitable for budget-conscious scenarios."
+        }
+    ],
+    "Avoid for tasks with high token variability": [
+        {
+            "codegemma-2b": "Poor performance across all subsets, not suitable for any task requiring handling of varying token counts."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Consistently low performance, not recommended for any serious application."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/8/EI/token_counts_report.json b/llm_insight/HumanEval/8/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..67ae02e75117309da8953cc976fc7c1e19984893
--- /dev/null
+++ b/llm_insight/HumanEval/8/EI/token_counts_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows high variability in performance across token subsets, excelling in subsets 2 and 8 but failing completely in subset 7. This suggests sensitivity to token count variations.",
+    "Nxcode-CQ-7B": "Consistently strong performance across most subsets, except for a complete failure in subset 7 and poor performance in subset 8. This indicates robustness to moderate token counts but struggles with extremes.",
+    "codegemma-2b": "Poor performance across all subsets, with scores decreasing as token counts increase. Not suitable for tasks requiring handling of varying token lengths.",
+    "codegemma-7b": "Moderate performance that degrades with increasing token counts. Shows some capability but is inconsistent.",
+    "codegemma-7b-it": "Better than its non-IT counterpart, but still shows significant performance drops with higher token counts. Handles moderate token counts reasonably well.",
+    "deepseek-coder-1.3b-base": "Low performance across the board, with complete failures in higher token subsets. Not recommended for tasks with diverse token counts.",
+    "deepseek-coder-6.7b-base": "Moderate performance that declines with token count. Shows some promise but is inconsistent.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in lower token subsets, with significant but not complete degradation in higher token subsets. A good balance for tasks with moderate token counts.",
+    "deepseek_coder_33b-base": "Moderate to good performance in lower token subsets, but struggles as token counts increase. Suitable for tasks with limited token variability.",
+    "deepseek_coder_33b-instruct": "Excellent performance in lower token subsets, with notable degradation in higher token subsets. Best suited for tasks where token counts are controlled.",
+    "codeqwen1.5-7b": "Moderate performance across most subsets, with significant drops in higher token counts. A middle-ground option for tasks with moderate token variability.",
+    "global_insights": "Models generally perform better with lower token counts, with performance degrading as token counts increase. The Nxcode-CQ-7B and deepseek_coder-6.7b-instruct models show the most consistent performance across varying token counts. Subset 7 appears to be particularly challenging, with most models failing completely. Subset 8 shows extreme variability, suggesting it may represent an outlier condition."
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/8/QS/CC_recommendation.json b/llm_insight/HumanEval/8/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..d48e87aba21c8d540925fce9b324bf52d5066554
--- /dev/null
+++ b/llm_insight/HumanEval/8/QS/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High accuracy and robustness": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across all subsets, making it reliable for critical CC tasks."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Strong performance in most subsets, suitable for tasks requiring high accuracy."
+        }
+    ],
+    "Balanced performance and cost": [
+        {
+            "deepseek_coder_33b-instruct": "Good performance with reasonable consistency, suitable for tasks where cost is a consideration."
+        },
+        {
+            "codegemma-7b-it": "Better performance than smaller models, offering a balance between cost and accuracy."
+        }
+    ],
+    "Low cost but limited accuracy": [
+        {
+            "codegemma-7b": "Moderate performance at a lower cost, suitable for non-critical tasks."
+        },
+        {
+            "deepseek-coder-6.7b-base": "Moderate performance, suitable for tasks where cost is a primary concern."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/HumanEval/8/QS/CC_report.json b/llm_insight/HumanEval/8/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..9bd73434f9ed05bfbe6ef6e279662780b6231829
--- /dev/null
+++ b/llm_insight/HumanEval/8/QS/CC_report.json
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 2, 6, and 7, but struggles in subset 8. This indicates variability in handling different CC perspectives, with a notable drop in subset 8.",
+    "Nxcode-CQ-7B": "Consistently high performance across most subsets, particularly excelling in subsets 1, 5, and 7. This model demonstrates robustness across different CC perspectives.",
+    "codegemma-2b": "Poor performance across all subsets, with the lowest scores in subsets 3, 5, and 8. This model is not suitable for tasks requiring high accuracy in CC perspectives.",
+    "codegemma-7b": "Moderate performance with significant drops in subsets 3, 5, and 8. The model shows some capability but is inconsistent.",
+    "codegemma-7b-it": "Better performance than codegemma-7b, especially in subsets 2, 4, and 6. However, it still struggles in subsets 3, 5, and 8.",
+    "deepseek-coder-1.3b-base": "Low performance across all subsets, with particularly poor results in subsets 3, 5, and 8. Not recommended for CC tasks.",
+    "deepseek-coder-6.7b-base": "Moderate performance with notable drops in subsets 3 and 8. Shows some promise but lacks consistency.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in subsets 1, 6, and 7, with decent results in other subsets. This model is reliable for CC tasks.",
+    "deepseek_coder_33b-base": "Moderate to good performance in subsets 1, 2, and 6, but struggles in subsets 3, 5, and 8. Inconsistent across CC perspectives.",
+    "deepseek_coder_33b-instruct": "Good performance in subsets 1, 2, and 6, with acceptable results in others. More consistent than the base version.",
+    "codeqwen1.5-7b": "Moderate performance with drops in subsets 3, 5, and 8. Shows some capability but is not the best choice for CC tasks.",
+    "global_insights": "Nxcode-CQ-7B and deepseek_coder-6.7b-instruct are the top performers across most subsets, showing robustness in handling different CC perspectives. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly, indicating that model size and instruction tuning play a significant role in performance. Subsets 3, 5, and 8 are consistently challenging for most models, suggesting these CC perspectives are more complex or require better generalization."
+} \ No newline at end of file diff --git a/llm_insight/HumanEval/8/QS/line_counts_recommendation.json b/llm_insight/HumanEval/8/QS/line_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..95cc959b765bb140c5e6ea89889152478d8612d0 --- /dev/null +++ b/llm_insight/HumanEval/8/QS/line_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High accuracy and consistency": [ + { + "Nxcode-CQ-7B": "Consistently high performance across all subsets, making it reliable for diverse tasks." + }, + { + "CodeFuse-DeepSeek-33b": "Strong performance in most subsets, particularly in subset 3, indicating good generalization." + } + ], + "Moderate accuracy with cost-effectiveness": [ + { + "deepseek_coder-6.7b-instruct": "Balances performance and cost, especially in early subsets, though it drops in later ones." + }, + { + "codegemma-7b-it": "Better performance than its base version, suitable for tasks where moderate accuracy is acceptable." + } + ], + "Budget-conscious with lower accuracy requirements": [ + { + "codeqwen1.5-7b": "Moderate performance at a lower cost, suitable for less critical tasks." + }, + { + "deepseek-coder-6.7b-base": "Decent performance in early subsets, but inconsistent; suitable for non-critical applications." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/8/QS/line_counts_report.json b/llm_insight/HumanEval/8/QS/line_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..710c8c6bb02d8e60b15c1f54d1f701aef40ff56b --- /dev/null +++ b/llm_insight/HumanEval/8/QS/line_counts_report.json @@ -0,0 +1,15 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows consistent performance across most subsets, with a slight dip in subsets 4, 6, and 7. It performs best in subset 3 with 85.0% accuracy.", + "Nxcode-CQ-7B": "This model demonstrates strong performance across all subsets, with the highest accuracy in subset 1 (94.75%) and maintaining above 80% in all other subsets.", + "codegemma-2b": "The model struggles significantly across all subsets, with the highest performance in subset 2 (54.5%) and the lowest in subset 8 (13.81%).", + "codegemma-7b": "Performance is inconsistent, with the best result in subset 1 (64.75%) and the worst in subset 3 (34.0%). Generally, it underperforms compared to larger models.", + "codegemma-7b-it": "Shows moderate performance, with the best result in subset 1 (76.5%) and the worst in subset 8 (39.29%). It performs better than its non-IT counterpart.", + "deepseek-coder-1.3b-base": "This model has poor performance across all subsets, with the highest accuracy in subset 1 (55.25%) and the lowest in subset 6 (19.29%).", + "deepseek-coder-6.7b-base": "Performance is moderate, with the best result in subset 1 (75.25%) and the worst in subset 8 (27.38%). It shows variability across subsets.", + "deepseek_coder-6.7b-instruct": "This model performs well in subsets 1, 2, and 3 (79.0%, 82.25%, 87.0%) but drops significantly in subset 8 (51.43%).", + "deepseek_coder_33b-base": "Shows decent performance in subsets 1 and 2 (75.5%, 78.25%) but drops in subset 3 (49.5%). Overall, it is inconsistent.", + "deepseek_coder_33b-instruct": "Performs well in subsets 1 and 2 (85.5%, 85.0%) but declines in later subsets, with the lowest in subset 8 (48.81%).", + "codeqwen1.5-7b": "Moderate performance, with the best result in subset 1 (63.5%) and the worst in subset 6 (37.14%). 
It is relatively stable but not outstanding.", + "new": "This model mirrors the performance of codegemma-7b-it, suggesting similar architecture or training. Best in subset 1 (76.5%) and worst in subset 8 (39.29%).", + "global_insights": "Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are the top performers, with Nxcode-CQ-7B being the most consistent. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle significantly. Instruct variants generally perform better than their base counterparts. The performance drop in later subsets (especially subset 8) suggests potential issues with data distribution or model generalization." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/8/QS/token_counts_recommendation.json b/llm_insight/HumanEval/8/QS/token_counts_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..5561c9547ea86a160c25a6a1c594651b649e1424 --- /dev/null +++ b/llm_insight/HumanEval/8/QS/token_counts_recommendation.json @@ -0,0 +1,26 @@ +{ + "High accuracy and robustness": [ + { + "Nxcode-CQ-7B": "Consistently high performance across all token subsets, making it reliable for varied input sizes." + }, + { + "deepseek_coder-6.7b-instruct": "Strong performance in most subsets, particularly in handling higher token counts." + } + ], + "Moderate accuracy with cost-effectiveness": [ + { + "deepseek_coder_33b-instruct": "Good balance of performance and cost, though slightly less robust than top performers." + }, + { + "codegemma-7b-it": "Better performance than smaller models but at a lower cost than the largest models." + } + ], + "Low-cost with basic performance": [ + { + "deepseek-coder-6.7b-base": "Moderate performance at a lower cost, suitable for less critical tasks." + }, + { + "codeqwen1.5-7b": "Decent performance for smaller budgets, though with some variability." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/8/QS/token_counts_report.json b/llm_insight/HumanEval/8/QS/token_counts_report.json new file mode 100644 index 0000000000000000000000000000000000000000..da47b449b4c946e2a2d353c68056ba6209ef923a --- /dev/null +++ b/llm_insight/HumanEval/8/QS/token_counts_report.json @@ -0,0 +1,15 @@ +{ + "CodeFuse-DeepSeek-33b": "The model shows high variability in performance across subsets, excelling in token_subset_4 with 100% accuracy but dropping significantly in token_subset_8 to 61.9%. This suggests sensitivity to token count variations.", + "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the lowest score being 73.81% in token_subset_8. Demonstrates robustness to token count changes.", + "codegemma-2b": "Poor performance across all subsets, particularly in token_subset_8 with only 1.19%. Not suitable for tasks requiring high accuracy.", + "codegemma-7b": "Moderate performance with significant drops in subsets with higher token counts, indicating limitations in handling larger inputs.", + "codegemma-7b-it": "Better than codegemma-7b but still shows variability, particularly in subsets with higher token counts.", + "deepseek-coder-1.3b-base": "Low performance across the board, with a notable drop in token_subset_8 to 6.43%. Not recommended for complex tasks.", + "deepseek-coder-6.7b-base": "Moderate performance with some variability, but generally better than the 1.3b version.", + "deepseek_coder-6.7b-instruct": "Strong performance in most subsets, particularly in token_subset_2 with 91.0%. 
Shows good adaptability to varying token counts.", + "deepseek_coder_33b-base": "Decent performance but with noticeable drops in subsets with higher token counts.", + "deepseek_coder_33b-instruct": "Consistently good performance across most subsets, though it drops in token_subset_8 to 36.67%.", + "codeqwen1.5-7b": "Moderate performance with some variability, particularly in subsets with higher token counts.", + "new": "Similar performance to codegemma-7b-it, indicating possible redundancy or similar architecture.", + "global_insights": "Models like Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show the most consistent performance across varying token counts. Smaller models (e.g., codegemma-2b, deepseek-coder-1.3b-base) struggle significantly, especially with larger inputs. There is a clear trend where larger, more sophisticated models handle token count variations better, but even they show some sensitivity in the highest token count subsets." +} \ No newline at end of file diff --git a/llm_insight/HumanEval/problem_type_recommendation.json b/llm_insight/HumanEval/problem_type_recommendation.json new file mode 100644 index 0000000000000000000000000000000000000000..e075915cf3f3ce220260517cf26ad3d00e090046 --- /dev/null +++ b/llm_insight/HumanEval/problem_type_recommendation.json @@ -0,0 +1,26 @@ +{ + "High accuracy tasks": [ + { + "CodeFuse-DeepSeek-33b": "This model achieves 100% accuracy in Stack and Matrix tasks, making it ideal for high-accuracy requirements in these areas." + }, + { + "Nxcode-CQ-7B": "This model performs well across most tasks, making it a reliable choice for general high-accuracy needs." + } + ], + "Balanced performance and cost-effectiveness": [ + { + "deepseek_coder-6.7b-instruct": "This model offers a good balance of performance and cost, particularly for String, Math, and Array tasks." + }, + { + "deepseek_coder_33b-instruct": "This model provides strong performance in several tasks while being more cost-effective than the top performers." + } + ], + "Low-cost solutions": [ + { + "codegemma-7b-it": "This model is a cost-effective option for tasks where moderate performance is acceptable." + }, + { + "codeqwen1.5-7b": "This model offers balanced performance at a lower cost, suitable for budget-conscious scenarios." + } + ] +} \ No newline at end of file diff --git a/llm_insight/HumanEval/problem_type_report.json b/llm_insight/HumanEval/problem_type_report.json new file mode 100644 index 0000000000000000000000000000000000000000..4680d1d65e8d90a4421a8ca113a0c5aaf26bd74d --- /dev/null +++ b/llm_insight/HumanEval/problem_type_report.json @@ -0,0 +1,14 @@ +{ + "CodeFuse-DeepSeek-33b": "This model performs exceptionally well in Stack and Matrix tasks, achieving 100% accuracy. However, its performance in Math and String tasks is relatively lower, suggesting room for improvement in these areas.", + "Nxcode-CQ-7B": "Nxcode-CQ-7B shows strong performance across most tasks, particularly in String, Math, and Array tasks. Its performance in Hash Table and Matrix tasks is notably weaker, indicating a potential area for further optimization.", + "codegemma-2b": "This model has the lowest performance across all tasks, with particularly poor results in Hash Table and Matrix tasks. It may not be suitable for complex problem-solving scenarios.", + "codegemma-7b": "While better than codegemma-2b, this model still struggles with Hash Table and Matrix tasks. 
diff --git a/llm_insight/HumanEval/problem_type_report.json b/llm_insight/HumanEval/problem_type_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..4680d1d65e8d90a4421a8ca113a0c5aaf26bd74d
--- /dev/null
+++ b/llm_insight/HumanEval/problem_type_report.json
@@ -0,0 +1,14 @@
+{
+  "CodeFuse-DeepSeek-33b": "This model performs exceptionally well in Stack and Matrix tasks, achieving 100% accuracy. However, its performance in Math and String tasks is relatively lower, suggesting room for improvement in these areas.",
+  "Nxcode-CQ-7B": "Nxcode-CQ-7B shows strong performance across most tasks, particularly in String, Math, and Array tasks. Its performance in Hash Table and Matrix tasks is notably weaker, indicating a potential area for further optimization.",
+  "codegemma-2b": "This model has the lowest performance across all tasks, with particularly poor results in Hash Table and Matrix tasks. It may not be suitable for complex problem-solving scenarios.",
+  "codegemma-7b": "While better than codegemma-2b, this model still struggles with Hash Table and Matrix tasks. Its performance in other areas is moderate but not outstanding.",
+  "codegemma-7b-it": "This model shows significant improvement over its smaller counterparts, particularly in String and Math tasks. However, it still underperforms in Hash Table and Matrix tasks.",
+  "deepseek-coder-1.3b-base": "This model has low performance across all tasks, similar to codegemma-2b. It is not recommended for tasks requiring high accuracy.",
+  "deepseek-coder-6.7b-base": "This model shows moderate performance, with better results in Math and Array tasks. However, it still struggles with Hash Table and Matrix tasks.",
+  "deepseek_coder-6.7b-instruct": "This model performs well in String, Math, and Array tasks, but its performance drops significantly in Hash Table and Matrix tasks.",
+  "deepseek_coder_33b-base": "This model has moderate to good performance in most tasks, except for Hash Table and Matrix tasks where it underperforms.",
+  "deepseek_coder_33b-instruct": "This model shows strong performance in String, Math, and Array tasks, but its performance in Hash Table and Matrix tasks is still lacking.",
+  "codeqwen1.5-7b": "This model has moderate performance across most tasks, with no standout strengths or weaknesses. It is a balanced but not exceptional performer.",
+  "global_insights": "The models generally perform well in String, Math, and Array tasks but struggle with Hash Table and Matrix tasks. Larger models tend to perform better, but there are exceptions like Nxcode-CQ-7B which performs well despite its size. The best-performing models are CodeFuse-DeepSeek-33b and Nxcode-CQ-7B, but they have specific weaknesses in certain tasks."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/3/EI/CC_recommendation.json b/llm_insight/MBPP/3/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..016f570e924c729b57e01b100895af1393b09196
--- /dev/null
+++ b/llm_insight/MBPP/3/EI/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High performance and robustness": [
+    {
+      "DeepSeekCoder-6.7b-instruct": "This model excels across all subsets, making it ideal for tasks requiring high accuracy and robustness."
+    },
+    {
+      "codeqwen2.5-7b": "This model shows top-tier performance on the first subset and maintains strong results on others, suitable for high-stakes applications."
+    }
+  ],
+  "Balanced performance and cost-effectiveness": [
+    {
+      "codeqwen2.5-1.5b": "This model offers a good balance between performance and cost, performing well across all subsets."
+    },
+    {
+      "DeepSeekCoder-6.7b-base": "A cost-effective option with solid performance across subsets, though not as strong as the instruct variant."
+    }
+  ],
+  "Budget-conscious with moderate needs": [
+    {
+      "CodeGemma-7b": "Suitable for tasks where the first subset is the primary focus, but performance drops on more complex subsets."
+    },
+    {
+      "DeepSeekCoder-1.3b-base": "A smaller model with acceptable performance on the first two subsets, ideal for less critical tasks."
+    }
+  ]
+}
\ No newline at end of file
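
The *_report.json files are flat: one analysis string per model plus a closing "global_insights" entry. A small sketch, under the assumption that every report follows this shape, for separating the per-model analyses from the summary:

import json

def load_report(path):
    # Reports are flat: {"<model>": "<analysis>", ..., "global_insights": "<summary>"}.
    with open(path) as f:
        report = json.load(f)
    global_insights = report.pop("global_insights", "")
    return report, global_insights
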
diff --git a/llm_insight/MBPP/3/EI/CC_report.json b/llm_insight/MBPP/3/EI/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..a6a9d34b123041c6894eb460196cc8a9fe8366c5
--- /dev/null
+++ b/llm_insight/MBPP/3/EI/CC_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "The model shows a significant drop in performance from CC_subset_1 to CC_subset_3, indicating poor generalization on more complex subsets. It performs moderately on the first subset but fails completely on the third.",
+  "CodeGemma-7b-it": "This model maintains relatively good performance on the first two subsets but fails on the third, similar to other CodeGemma variants. The performance drop is less severe compared to CodeGemma-2b.",
+  "CodeGemma-7b": "The model performs well on the first subset but shows a notable decline in the second and fails on the third. It outperforms CodeGemma-2b and -7b-it on the first subset.",
+  "DeepSeekCoder-1.3b-base": "This model shows moderate performance on the first two subsets but fails on the third. It is outperformed by larger models in the same family.",
+  "DeepSeekCoder-6.7b-base": "The model performs well on the first two subsets and maintains a decent performance on the third, showing better generalization compared to smaller models.",
+  "DeepSeekCoder-6.7b-instruct": "This model excels across all subsets, with perfect performance on the third subset. It demonstrates strong generalization and robustness.",
+  "codeqwen2.5-1.5b": "The model performs exceptionally well on the first subset and maintains good performance on the second and third subsets, showing strong generalization capabilities.",
+  "codeqwen2.5-7b": "This model outperforms all others on the first subset and maintains good performance on the second and third subsets, indicating excellent scalability and robustness.",
+  "global_insights": "Larger models generally perform better across subsets, with DeepSeekCoder-6.7b-instruct and codeqwen2.5-7b showing the best overall performance. The third subset appears to be the most challenging, with most models failing or showing significant performance drops. The codeqwen2.5 models demonstrate strong generalization, while CodeGemma models struggle with more complex subsets."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/3/EI/token_counts_recommendation.json b/llm_insight/MBPP/3/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..acf14c2354dfc5d26fc679e1f7d8bbb9fd35c628
--- /dev/null
+++ b/llm_insight/MBPP/3/EI/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High performance, regardless of cost": [
+    {
+      "DeepSeekCoder-6.7b-instruct": "Excels across all token subsets, particularly with high token counts."
+    },
+    {
+      "codeqwen2.5-7b": "Strong performance in moderate token subsets, though less effective with very high counts."
+    }
+  ],
+  "Cost-effective for moderate token counts": [
+    {
+      "CodeGemma-7b": "Balanced performance across subsets, suitable for varied token complexities."
+    },
+    {
+      "DeepSeekCoder-6.7b-base": "Robust performance, especially in higher token counts, at a lower cost than the instruct variant."
+    }
+  ],
+  "Budget-conscious with lower token counts": [
+    {
+      "CodeGemma-2b": "Moderate performance, suitable for simpler tasks with lower token counts."
+    },
+    {
+      "codeqwen2.5-1.5b": "Good performance in lower to mid-range token counts, cost-effective."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/3/EI/token_counts_report.json b/llm_insight/MBPP/3/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..85694e1640e91d9120076dab6e6f9b1d5835bd13
--- /dev/null
+++ b/llm_insight/MBPP/3/EI/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "CodeGemma-2b shows moderate performance across subsets, with a noticeable drop in token_subset_2. This suggests it may struggle with mid-range token complexity.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it performs better than its 2b counterpart, especially in token_subset_1, but still lags in token_subset_2. Its performance is consistent in token_subset_3.",
+  "CodeGemma-7b": "CodeGemma-7b demonstrates strong performance in token_subset_1 and token_subset_2, indicating better handling of varied token complexities. Its performance in token_subset_3 is stable.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest performance across all subsets, particularly in token_subset_2 and token_subset_3, suggesting limitations in handling higher token counts.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows robust performance, especially in token_subset_1 and token_subset_3, but a slight dip in token_subset_2. It handles higher token counts well.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct excels in all subsets, particularly in token_subset_3, indicating superior performance with higher token counts and complexity.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b performs well in token_subset_1 and token_subset_2 but shows a significant drop in token_subset_3, suggesting limitations with very high token counts.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b leads in token_subset_1 and token_subset_2 but has a notable decline in token_subset_3. It is strong with moderate token counts but less so with very high ones.",
+  "global_insights": "Larger models generally perform better across token subsets, with DeepSeekCoder-6.7b-instruct being the standout. Performance dips in mid-range token subsets (token_subset_2) are common, suggesting this may be a challenging range. Models with higher parameter counts tend to handle higher token counts more effectively."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/3/QS/CC_recommendation.json b/llm_insight/MBPP/3/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..f28716cd8faaa943f42b52bf409595319740338a
--- /dev/null
+++ b/llm_insight/MBPP/3/QS/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High performance and robustness": [
+    {
+      "codeqwen2.5-7b": "Consistently top performer across all subsets with minimal performance drop in high complexity tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-instruct": "Strong and stable performance across all subsets, especially in high complexity tasks."
+    }
+  ],
+  "Balanced performance and cost-effectiveness": [
+    {
+      "CodeGemma-7b": "Good performance with moderate resource requirements, suitable for a wide range of tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-base": "Competitive performance with a balance of cost and capability."
+    }
+  ],
+  "Lightweight and efficient": [
+    {
+      "codeqwen2.5-1.5b": "Decent performance for mid-complexity tasks with lower resource usage."
+    },
+    {
+      "CodeGemma-7b-it": "Better than the 2b version but still lightweight, suitable for less complex tasks."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/3/QS/CC_report.json b/llm_insight/MBPP/3/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..13d8abdebfb7c8e7d77f7b4f48b375cad0224943
--- /dev/null
+++ b/llm_insight/MBPP/3/QS/CC_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "CodeGemma-2b shows a consistent decline in performance across subsets, indicating it struggles with more complex tasks. Its performance drops significantly in CC_subset_3, suggesting limitations in handling higher complexity.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it performs better than its 2b counterpart but still shows a downward trend across subsets. The drop in CC_subset_3 is less severe, indicating better handling of complexity.",
+  "CodeGemma-7b": "CodeGemma-7b maintains relatively stable performance across subsets, with the smallest drop in CC_subset_3. This suggests it is more robust to increasing task complexity.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest performance overall and a significant drop in CC_subset_2 and CC_subset_3, indicating it is not suitable for complex tasks.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows strong performance in CC_subset_1 but declines in CC_subset_2 and CC_subset_3. However, it remains competitive, especially in higher complexity subsets.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct performs consistently well across all subsets, with the smallest performance drop in CC_subset_3. This indicates it is highly effective even for complex tasks.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b shows an interesting trend, peaking in CC_subset_2 before dropping in CC_subset_3. This suggests it may be optimized for mid-complexity tasks.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer across all subsets, with the highest scores and minimal performance drop in CC_subset_3. It is clearly the most robust model for varying task complexities.",
+  "global_insights": "Larger models generally perform better across all subsets, with codeqwen2.5-7b and DeepSeekCoder-6.7b-instruct leading the pack. Performance drops are observed in higher complexity subsets, but the extent varies by model. CodeGemma-7b and codeqwen2.5-7b show the least sensitivity to increasing complexity, making them reliable choices for diverse tasks."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/3/QS/token_counts_recommendation.json b/llm_insight/MBPP/3/QS/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..9df460c4942838e2fb5bc31233300e98dbe03892
--- /dev/null
+++ b/llm_insight/MBPP/3/QS/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High-performance tasks with no budget constraints": [
+    {
+      "codeqwen2.5-7b": "Consistently outperforms all other models across all token subsets, making it ideal for high-performance tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-instruct": "Offers stable performance across all subsets, making it a reliable choice for tasks requiring consistency."
+    }
+  ],
+  "Cost-effective tasks with moderate performance requirements": [
+    {
+      "codeqwen2.5-1.5b": "Provides impressive performance for its size, making it a cost-effective choice for many tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-base": "Balances performance and cost, especially for tasks that don't require the highest accuracy."
+    }
+  ],
+  "Lightweight tasks with limited resources": [
+    {
+      "CodeGemma-7b": "Performs well for its size and is suitable for tasks with limited computational resources."
+    },
+    {
+      "CodeGemma-7b-it": "Offers decent performance and stability, making it a good choice for lightweight tasks."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/3/QS/token_counts_report.json b/llm_insight/MBPP/3/QS/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ebb13b7aa051c0be25b1be382a876a578dbbf9f
--- /dev/null
+++ b/llm_insight/MBPP/3/QS/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "The performance of CodeGemma-2b decreases significantly as token counts increase, dropping from 50.3 in subset_1 to 33.06 in subset_3. This suggests that the model struggles with longer code snippets.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it shows relatively stable performance across subsets, with a slight dip in subset_3. The model maintains decent performance even with higher token counts.",
+  "CodeGemma-7b": "CodeGemma-7b performs well across all subsets, with the highest scores in subset_1 and a gradual decline in subset_3. It handles longer code snippets better than its 2b counterpart.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base shows a significant drop in performance with increasing token counts, similar to CodeGemma-2b. This indicates limitations in processing longer code.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base maintains strong performance across subsets, though it does show a decline in subset_3. It outperforms the 1.3b version significantly.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct is the most stable model across all subsets, with only a slight performance drop in subset_3. It handles longer code snippets exceptionally well.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b shows impressive performance across all subsets, with the highest scores in subset_1 and a gradual decline in subset_3. It outperforms many larger models.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer across all subsets, with the highest scores in each category. It handles longer code snippets with minimal performance drop.",
+  "global_insights": "Larger models generally perform better across all token subsets, with codeqwen2.5-7b and DeepSeekCoder-6.7b-instruct leading the pack. Smaller models like CodeGemma-2b and DeepSeekCoder-1.3b-base struggle with longer code snippets. The performance drop in higher token subsets is a common trend, but the extent varies by model size and architecture."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/4/EI/CC_recommendation.json b/llm_insight/MBPP/4/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5e74ad8e7761d45b7341cf5ef8c04e89d48f568
--- /dev/null
+++ b/llm_insight/MBPP/4/EI/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High performance and consistency": [
+    {
+      "DeepSeekCoder-6.7b-instruct": "This model excels across all subsets, particularly in complex tasks (CC_subset_3 and CC_subset_4), making it the top choice for high-performance needs."
+    },
+    {
+      "codeqwen2.5-7b": "Despite a drop in CC_subset_3, this model performs exceptionally well in other subsets, especially CC_subset_4, and is a strong alternative."
+    }
+  ],
+  "Balanced performance and cost-effectiveness": [
+    {
+      "DeepSeekCoder-6.7b-base": "This model offers strong performance in most subsets and is more cost-effective than the instruct version, suitable for balanced needs."
+    },
+    {
+      "CodeGemma-7b": "A reliable choice among CodeGemma models, offering decent performance in most subsets without the highest cost."
+    }
+  ],
+  "Budget-conscious with moderate performance": [
+    {
+      "codeqwen2.5-1.5b": "This model provides good performance in CC_subset_1 and CC_subset_2 and excels in CC_subset_4, making it a budget-friendly option."
+    },
+    {
+      "DeepSeekCoder-1.3b-base": "While inconsistent, this model can be considered for less critical tasks where cost is a major factor."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/4/EI/CC_report.json b/llm_insight/MBPP/4/EI/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..0f53e555fd98169b6d689aa898c2a1e5cc80f89a
--- /dev/null
+++ b/llm_insight/MBPP/4/EI/CC_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "CodeGemma-2b shows a significant drop in performance across subsets, especially in CC_subset_4 where it scores 0.0. This indicates poor generalization on more complex tasks.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it performs better than its 2b counterpart but still struggles with CC_subset_4, scoring 0.0. It shows moderate performance in other subsets.",
+  "CodeGemma-7b": "CodeGemma-7b demonstrates consistent performance across subsets but fails in CC_subset_4, similar to other CodeGemma models. It has the highest scores among CodeGemma models in CC_subset_1 and CC_subset_2.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has inconsistent performance, with a notable drop in CC_subset_3 and CC_subset_4. It performs better in CC_subset_2 compared to CC_subset_1.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows strong performance in CC_subset_1 and CC_subset_2, and maintains decent scores in CC_subset_3 and CC_subset_4, making it a reliable choice.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct excels in all subsets, particularly in CC_subset_3 and CC_subset_4 with scores of 80.0 and 100.0 respectively. It is the top-performing model overall.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b has high scores in CC_subset_1 and CC_subset_2 but drops significantly in CC_subset_3. It surprisingly scores 100.0 in CC_subset_4, indicating potential inconsistency.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the best performer in CC_subset_1 and CC_subset_2 but shows a sharp decline in CC_subset_3. Like its 1.5b version, it scores 100.0 in CC_subset_4.",
+  "global_insights": "Models generally perform well in CC_subset_1 and CC_subset_2 but struggle in CC_subset_3 and CC_subset_4, except for DeepSeekCoder-6.7b-instruct and codeqwen2.5 models. The latter two show exceptional performance in CC_subset_4. Larger models tend to perform better, with DeepSeekCoder-6.7b-instruct being the most consistent across all subsets."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/4/EI/token_counts_recommendation.json b/llm_insight/MBPP/4/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..7554af6c6d2c3b13aaafbcb58e797662809f9bef
--- /dev/null
+++ b/llm_insight/MBPP/4/EI/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High accuracy and consistency across token lengths": [
+    {
+      "DeepSeekCoder-6.7b-instruct": "This model excels in larger token subsets and maintains high performance across all subsets, making it ideal for tasks requiring consistency and accuracy."
+    },
+    {
+      "codeqwen2.5-7b": "This model performs exceptionally well in smaller token subsets and is a strong choice for tasks where token lengths are moderate."
+    }
+  ],
+  "Cost-effective for smaller token subsets": [
+    {
+      "CodeGemma-7b": "This model offers good performance in smaller token subsets at a lower computational cost compared to larger models."
+    },
+    {
+      "codeqwen2.5-1.5b": "This model is highly effective for specific token ranges and is a cost-effective option for tasks within those ranges."
+    }
+  ],
+  "Not recommended for complex tasks": [
+    {
+      "CodeGemma-2b": "This model struggles with larger token subsets and is not suitable for tasks requiring high accuracy or handling complex code snippets."
+    },
+    {
+      "DeepSeekCoder-1.3b-base": "This model has the lowest performance overall and is not recommended for any task requiring reliable accuracy."
+    }
+  ]
+}
\ No newline at end of file
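
The file paths follow a fixed layout: llm_insight/<dataset>/<num_parts>/<QS|EI>/<perspective>_{report,recommendation}.json, with the problem-type files sitting directly under the dataset directory. A hypothetical walker over that tree (the function name and glob pattern are inferred from the paths in this diff, not part of the app code):

import json
from pathlib import Path

def iter_global_insights(dataset, perspective):
    # Yields (path, global_insights) for every split of one dataset/perspective.
    for path in sorted(Path("llm_insight").glob(f"{dataset}/*/*/{perspective}_report.json")):
        with open(path) as f:
            yield path, json.load(f)["global_insights"]

# e.g. for path, summary in iter_global_insights("MBPP", "CC"): print(path, summary)
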
diff --git a/llm_insight/MBPP/4/EI/token_counts_report.json b/llm_insight/MBPP/4/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..78c1bc993d10aa971450c1d91d7d918b80bf0884
--- /dev/null
+++ b/llm_insight/MBPP/4/EI/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "CodeGemma-2b shows a decline in performance as token counts increase, with the lowest performance in token_subset_3 (28.0). It struggles with larger token subsets, indicating limitations in handling more complex or lengthy code snippets.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it performs moderately across subsets but also shows a significant drop in token_subset_3 (28.0). Its performance is inconsistent, suggesting it may not be reliable for tasks with varying token lengths.",
+  "CodeGemma-7b": "CodeGemma-7b has the best performance among the CodeGemma models, especially in token_subset_1 (59.68) and token_subset_2 (56.0). However, it still struggles with larger token subsets, similar to its smaller counterparts.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest performance overall, particularly in token_subset_3 (10.0). It is not suitable for tasks requiring high accuracy or handling larger token subsets.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows strong performance in token_subset_1 (63.17) and token_subset_4 (66.67), but its performance drops in token_subset_3 (40.0). It is a robust model but may not be consistent across all token lengths.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct excels in token_subset_3 (77.78) and maintains high performance in other subsets. It is the most consistent and high-performing model across varying token lengths.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b has outstanding performance in token_subset_3 (85.71) but drops significantly in token_subset_4 (33.33). It is highly effective for specific token ranges but inconsistent overall.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer in token_subset_1 (78.26) and token_subset_2 (74.44), with a notable drop in token_subset_3 (57.14). It is a strong model but may not be the best for all token subsets.",
+  "global_insights": "Models generally perform better in smaller token subsets (token_subset_1 and token_subset_2) and struggle with larger ones (token_subset_3 and token_subset_4). DeepSeekCoder-6.7b-instruct and codeqwen2.5-7b are the most consistent high performers. Smaller models like CodeGemma-2b and DeepSeekCoder-1.3b-base are less reliable for complex tasks. The performance drop in larger token subsets suggests that token length is a critical factor in model accuracy."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/4/QS/CC_recommendation.json b/llm_insight/MBPP/4/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..e8fc9462254e1ce2d10a3b6f741809713fb159d9
--- /dev/null
+++ b/llm_insight/MBPP/4/QS/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High performance and scalability": [
+    {
+      "codeqwen2.5-7b": "Consistently high performance across all subsets with minimal decline in complex tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-instruct": "Strong performance with a slight drop in the most complex subset, making it reliable for varied tasks."
+    }
+  ],
+  "Balanced performance and cost-effectiveness": [
+    {
+      "CodeGemma-7b": "Good performance across subsets with a moderate drop in complex tasks, suitable for balanced budgets."
+    },
+    {
+      "codeqwen2.5-1.5b": "Robust performance with a smaller model size, offering a good trade-off between cost and capability."
+    }
+  ],
+  "Budget-conscious with acceptable performance": [
+    {
+      "DeepSeekCoder-6.7b-base": "Variable performance but acceptable for less complex tasks, suitable for tighter budgets."
+    },
+    {
+      "CodeGemma-7b-it": "Moderate performance with a steady decline, suitable for less demanding scenarios."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/4/QS/CC_report.json b/llm_insight/MBPP/4/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..08e59bfb02bb0c94875207322ba3265d91bbe742
--- /dev/null
+++ b/llm_insight/MBPP/4/QS/CC_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "The performance of CodeGemma-2b decreases significantly as the complexity of the subsets increases, dropping from 55.2% in subset_1 to 30.4% in subset_4. This suggests that the model struggles with more complex tasks.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it shows a moderate decline in performance across subsets, from 60.0% in subset_1 to 43.04% in subset_4. It performs better than CodeGemma-2b but still shows limitations with increasing complexity.",
+  "CodeGemma-7b": "CodeGemma-7b maintains relatively stable performance across subsets, starting at 68.48% in subset_1 and dropping to 48.96% in subset_4. It outperforms its smaller counterparts, indicating better handling of complex tasks.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base shows a consistent decline in performance, from 49.28% in subset_1 to 32.16% in subset_4. It is the least performant among the models evaluated.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base performs well in the first two subsets (65.6%) but shows a drop in subset_3 (52.0%) and a slight recovery in subset_4 (54.4%). This indicates variability in handling different complexities.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct shows strong performance across all subsets, with a peak of 70.83% in subset_2 and a slight drop to 59.32% in subset_4. It is one of the top performers.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b demonstrates robust performance, starting at 70.0% in subset_1 and maintaining above 63.64% in subset_4. It handles complexity better than most models.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer, with the highest scores across all subsets (87.0% in subset_1 to 68.69% in subset_4). It shows minimal decline, indicating excellent scalability with task complexity.",
+  "global_insights": "Larger models generally perform better across all subsets, with codeqwen2.5-7b and DeepSeekCoder-6.7b-instruct leading the pack. Performance tends to decline with increasing subset complexity, but the rate of decline varies by model size and architecture. The 7b models (CodeGemma-7b, codeqwen2.5-7b) show better resilience to complexity compared to their smaller counterparts."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/4/QS/token_counts_recommendation.json b/llm_insight/MBPP/4/QS/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..6fd8f8caed6a352a76dc6ada2669ab19e17e3040
--- /dev/null
+++ b/llm_insight/MBPP/4/QS/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High performance with large token counts": [
+    {
+      "codeqwen2.5-7b": "Consistently high performance across all token subsets, making it ideal for tasks with varying token counts."
+    },
+    {
+      "DeepSeekCoder-6.7b-instruct": "Strong performance with moderate to large token counts, suitable for complex tasks."
+    }
+  ],
+  "Cost-effective for moderate token counts": [
+    {
+      "CodeGemma-7b": "Balanced performance across all subsets, offering good value for moderate token counts."
+    },
+    {
+      "codeqwen2.5-1.5b": "Excellent performance in moderate token counts, providing a good balance of cost and performance."
+    }
+  ],
+  "Budget-friendly for small token counts": [
+    {
+      "CodeGemma-2b": "Adequate performance for small token counts, suitable for budget-conscious users."
+    },
+    {
+      "DeepSeekCoder-1.3b-base": "Lowest cost option, but only recommended for very small token counts."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/4/QS/token_counts_report.json b/llm_insight/MBPP/4/QS/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..8d4165bb5df44037d2595e8d4d575a727f5fdd5d
--- /dev/null
+++ b/llm_insight/MBPP/4/QS/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "The performance of CodeGemma-2b decreases as the token count increases, with the lowest score in token_subset_4 (30.72). This suggests that the model struggles with larger token counts.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it shows a peak performance in token_subset_2 (57.28) but maintains relatively stable scores across other subsets, indicating better handling of moderate token counts.",
+  "CodeGemma-7b": "CodeGemma-7b consistently performs well across all token subsets, with the highest score in token_subset_1 (62.08). This model is robust across varying token counts.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest scores among all models, particularly in token_subset_4 (26.88), indicating poor performance with larger token counts.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base performs well in the first three subsets but drops significantly in token_subset_4 (48.8), suggesting limitations with higher token counts.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct shows high performance in token_subset_1 and token_subset_2 (66.67 and 67.2) but a slight drop in token_subset_3 (61.54), indicating strong performance with moderate token counts.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b performs exceptionally well in token_subset_2 (75.0) and maintains good performance in other subsets, showing versatility across token counts.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer across all subsets, with the highest score in token_subset_1 (83.0). This model excels regardless of token count.",
+  "global_insights": "Models generally perform better with lower to moderate token counts, with performance degrading as token counts increase. codeqwen2.5-7b and CodeGemma-7b are the most consistent performers across all subsets. Smaller models like DeepSeekCoder-1.3b-base and CodeGemma-2b struggle with larger token counts, while larger models like codeqwen2.5-7b and DeepSeekCoder-6.7b-instruct handle them better."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/5/EI/CC_recommendation.json b/llm_insight/MBPP/5/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..3d7d57258930e706c16e126ed3cb6a7d8ff90ce2
--- /dev/null
+++ b/llm_insight/MBPP/5/EI/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "For high-complexity tasks where performance is critical": [
+    {
+      "DeepSeekCoder-6.7b-instruct": "Consistently high performance across all subsets, including a perfect score in the most complex subset."
+    },
+    {
+      "codeqwen2.5-7b": "Top performer in most subsets, though it fails in subset 4, making it a strong but slightly less reliable choice."
+    }
+  ],
+  "For balanced performance and cost-effectiveness": [
+    {
+      "DeepSeekCoder-6.7b-base": "Strong performance in most subsets and handles some high-complexity tasks, offering a good balance between cost and capability."
+    },
+    {
+      "codeqwen2.5-1.5b": "Good performance in simpler subsets and handles high-complexity tasks well, making it a cost-effective choice for varied scenarios."
+    }
+  ],
+  "For simpler tasks where cost is a primary concern": [
+    {
+      "CodeGemma-7b-it": "Performs well in simpler subsets and is more cost-effective than larger models, though it struggles with high complexity."
+    },
+    {
+      "DeepSeekCoder-1.3b-base": "Moderate performance in simpler subsets and is a low-cost option, suitable for less demanding tasks."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/5/EI/CC_report.json b/llm_insight/MBPP/5/EI/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..ba23e22a00c7eb1dfaf4158041b36d684bbaef31
--- /dev/null
+++ b/llm_insight/MBPP/5/EI/CC_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "CodeGemma-2b shows a steep decline in performance as the complexity of the subsets increases, dropping to 0% in subsets 4 and 5. This indicates limited capability in handling more complex tasks.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it performs better than its 2b counterpart but still struggles with subsets 4 and 5, suggesting it may not be suitable for highly complex tasks despite its improved performance in simpler subsets.",
+  "CodeGemma-7b": "CodeGemma-7b shows inconsistent performance, with a notable drop in subset 3 but a rebound in subset 4. Its inability to handle subset 5 indicates limitations in extreme complexity scenarios.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base maintains moderate performance across subsets 1-3 but fails in subsets 4 and 5, similar to the CodeGemma models, highlighting a common limitation among smaller models.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows strong performance in subsets 1-3 and manages to score in subsets 4 and 5, indicating better handling of complexity compared to smaller models.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct excels in subsets 1-3 and performs well in subsets 4 and 5, including a perfect score in subset 5. This suggests it is highly capable of handling complex tasks.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b shows strong performance in subsets 1 and 2 but drops in subset 3. It recovers in subsets 4 and 5, with a perfect score in subset 5, indicating robustness in high-complexity tasks.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer in subsets 1-3 and achieves a perfect score in subset 5, though it fails in subset 4. This inconsistency suggests potential instability in certain high-complexity scenarios.",
+  "global_insights": "Larger models generally outperform smaller ones in handling complex tasks, with DeepSeekCoder-6.7b-instruct and codeqwen2.5-7b leading the pack. However, even the best models show inconsistencies in certain subsets, indicating that task complexity and model robustness are critical factors. The performance drop in subset 4 for some models suggests a unique challenge in that subset that may require further investigation."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/5/EI/token_counts_recommendation.json b/llm_insight/MBPP/5/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..2e9908aa5b09623d3d6cc19e2c3ad39c2c18f120
--- /dev/null
+++ b/llm_insight/MBPP/5/EI/token_counts_recommendation.json
@@ -0,0 +1,23 @@
+{
+  "High accuracy and complexity handling": [
+    {
+      "DeepSeekCoder-6.7b-instruct": "Best overall performance, especially in complex tasks."
+    },
+    {
+      "codeqwen2.5-7b": "Close second, robust across most subsets."
+    }
+  ],
+  "Moderate complexity with cost-effectiveness": [
+    {
+      "CodeGemma-7b": "Balances performance and cost for less complex tasks."
+    },
+    {
+      "codeqwen2.5-1.5b": "Good performance in moderate complexity tasks at a lower cost."
+    }
+  ],
+  "Low complexity and minimal cost": [
+    {
+      "CodeGemma-2b": "Suitable for simple tasks where high accuracy is not critical."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/5/EI/token_counts_report.json b/llm_insight/MBPP/5/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..2a17fb0236bf9c5cc10daa0d24c8d80bd6efa73a
--- /dev/null
+++ b/llm_insight/MBPP/5/EI/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "The model shows moderate performance in subsets 1-4 but fails completely in subset 5. This suggests it struggles with higher token counts or more complex tasks.",
+  "CodeGemma-7b-it": "Similar to CodeGemma-2b but with slightly better performance across subsets 1-4. Still fails in subset 5, indicating limitations with complex tasks.",
+  "CodeGemma-7b": "Performs better than its smaller counterparts in subsets 1-4 but also fails in subset 5. The improvement is notable but not sufficient for high-complexity tasks.",
+  "DeepSeekCoder-1.3b-base": "The weakest performer overall, with poor results across all subsets. Not suitable for tasks requiring high accuracy.",
+  "DeepSeekCoder-6.7b-base": "Shows strong performance in subsets 1-4 and manages a 50% score in subset 5, indicating better handling of complex tasks.",
+  "DeepSeekCoder-6.7b-instruct": "The best performer overall, excelling in subsets 1-4 and achieving 50% in subset 5. Handles complexity better than other models.",
+  "codeqwen2.5-1.5b": "Performs well in subsets 1-3 but drops in subset 4 and fails in subset 5. A good choice for moderate complexity tasks.",
+  "codeqwen2.5-7b": "The second-best performer, with strong results in subsets 1-4 and a 50% score in subset 5. A robust choice for most tasks.",
+  "global_insights": "Larger models generally perform better, with DeepSeekCoder-6.7b-instruct and codeqwen2.5-7b leading the pack. Subset 5 is challenging for all models, indicating a complexity threshold. Token count seems to correlate with performance degradation, suggesting that models struggle with longer or more complex inputs."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/5/QS/CC_recommendation.json b/llm_insight/MBPP/5/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..4460bbdbcfe160cfc680005ab49ae03e3e66dcad
--- /dev/null
+++ b/llm_insight/MBPP/5/QS/CC_recommendation.json
@@ -0,0 +1,28 @@
+{
+  "High performance and reliability": [
+    {
+      "codeqwen2.5-7b": "Consistently high performance across all subsets, making it ideal for diverse and complex tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-instruct": "Reliable and consistent performance, suitable for tasks requiring high accuracy."
+    }
+  ],
+  "Balanced performance and cost-effectiveness": [
+    {
+      "CodeGemma-7b": "Good performance with a reasonable model size, suitable for general-purpose tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-base": "Strong performance with a slightly lower cost than the top models."
+    }
+  ],
+  "Niche or specific tasks": [
+    {
+      "codeqwen2.5-1.5b": "Excels in specific task types, as seen in subset 3, making it suitable for niche applications."
+    }
+  ],
+  "Budget constraints": [
+    {
+      "CodeGemma-7b-it": "Decent performance at a lower cost, suitable for less complex tasks."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/5/QS/CC_report.json b/llm_insight/MBPP/5/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..305ca5e81d8a725dafc7cf6fcd6d0c8c6572b04c
--- /dev/null
+++ b/llm_insight/MBPP/5/QS/CC_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "The performance of CodeGemma-2b is inconsistent across subsets, with a significant drop in subsets 4 and 5 (34.2 and 33.6). This suggests the model struggles with more complex or specific tasks in these subsets.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it shows better performance than CodeGemma-2b but still exhibits a decline in subsets 4 and 5. The model's performance is more stable but still impacted by task complexity.",
+  "CodeGemma-7b": "CodeGemma-7b demonstrates robust performance across most subsets, though it also sees a drop in subsets 4 and 5. This indicates that while larger models handle complexity better, they are not immune to performance degradation.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest performance overall, with significant drops in subsets 3, 4, and 5. This suggests the model is not suitable for complex tasks.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base performs well across all subsets, with a slight drop in subset 3. The model handles complexity better than its smaller counterpart.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct shows the most consistent performance across all subsets, with only a minor drop in subset 5. This model is highly reliable for diverse tasks.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b has a standout performance in subset 3 (80.0) but shows variability in other subsets. This suggests the model excels in specific task types.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer overall, with high scores across all subsets. The model's consistency and high performance make it a strong choice for diverse tasks.",
+  "global_insights": "Larger models generally perform better, with codeqwen2.5-7b and DeepSeekCoder-6.7b-instruct leading the pack. Performance drops in subsets 4 and 5 suggest these subsets contain more complex or niche tasks. The QS split method effectively highlights model strengths and weaknesses across different task complexities."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/5/QS/token_counts_recommendation.json b/llm_insight/MBPP/5/QS/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..17255a4a313eba4be87e8ad40e6aca62151babe6
--- /dev/null
+++ b/llm_insight/MBPP/5/QS/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High performance and scalability with token counts": [
+    {
+      "codeqwen2.5-7b": "Consistently outperforms all other models across all token subsets, making it ideal for tasks requiring high accuracy regardless of code complexity."
+    },
+    {
+      "DeepSeekCoder-6.7b-instruct": "Shows strong performance, especially in handling longer code snippets, making it suitable for complex coding tasks."
+    }
+  ],
+  "Cost-effective for moderate token counts": [
+    {
+      "codeqwen2.5-1.5b": "Offers a good balance between performance and resource usage, especially for tasks with moderate token counts."
+    },
+    {
+      "DeepSeekCoder-6.7b-base": "Performs well in most subsets except the longest, providing a cost-effective solution for less complex tasks."
+    }
+  ],
+  "Not recommended": [
+    {
+      "DeepSeekCoder-1.3b-base": "Poor performance across all subsets, especially with longer code snippets."
+    },
+    {
+      "CodeGemma-2b": "Significant performance drop with increasing token counts, limiting its usability."
+    }
+  ]
+}
\ No newline at end of file
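
Each recommendation file sits next to a report covering the same dataset, split, and perspective, so the two can be cross-checked. A sanity-check sketch (purely illustrative; no such check exists in this PR) that flags models recommended without a matching report entry:

import json

def unmatched_recommendations(report_path, rec_path):
    # Returns models named in the recommendation file but absent from the sibling report.
    with open(report_path) as f:
        report_models = set(json.load(f)) - {"global_insights"}
    with open(rec_path) as f:
        recommended = {model
                       for entries in json.load(f).values()
                       for entry in entries
                       for model in entry}
    return recommended - report_models
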
diff --git a/llm_insight/MBPP/5/QS/token_counts_report.json b/llm_insight/MBPP/5/QS/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef0fd266cc523e85b859d6997f5756943eb21484
--- /dev/null
+++ b/llm_insight/MBPP/5/QS/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "The performance of CodeGemma-2b decreases significantly as token counts increase, dropping from 51.0 in subset_1 to 28.6 in subset_5. This suggests that the model struggles with longer code snippets.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it shows a peak performance in subset_2 (56.6) and subset_3 (59.4), but its performance declines in higher token count subsets, indicating limitations with more complex code.",
+  "CodeGemma-7b": "CodeGemma-7b maintains relatively stable performance across subsets 1-4 (around 60.0), but drops in subset_5 (50.6), suggesting it handles moderate token counts well but struggles with the longest snippets.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest performance overall, especially in subset_5 (22.8), indicating it is not suitable for tasks with high token counts.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base performs well in subsets 1-4 (60.0-65.0) but drops in subset_5 (44.0), showing robustness except for the most complex cases.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct shows variability, with the highest performance in subset_2 (72.34) and a dip in subset_3 (59.38), but recovers in subset_5 (62.5), indicating it can handle longer code snippets better than most.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b demonstrates strong performance across all subsets, with only a slight decline in subset_5 (62.03), making it a reliable choice for various token counts.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer across all subsets, with the highest scores in subsets 1-3 (81.25-82.5) and maintaining strong performance in subset_5 (70.89), indicating excellent scalability with token counts.",
+  "global_insights": "1. Larger models generally perform better across all token subsets. 2. Performance tends to decline with higher token counts, but some models like codeqwen2.5-7b and DeepSeekCoder-6.7b-instruct handle longer snippets better. 3. The 1.3b models (DeepSeekCoder-1.3b-base) are not recommended for complex tasks. 4. CodeGemma models show a clear drop in performance with increasing token counts, while codeqwen models maintain more consistent performance."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/6/EI/CC_recommendation.json b/llm_insight/MBPP/6/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..72f2708c7d48440fbab16609555b232b532410b9
--- /dev/null
+++ b/llm_insight/MBPP/6/EI/CC_recommendation.json
@@ -0,0 +1,23 @@
+{
+  "For high complexity tasks with no cost constraints": [
+    {
+      "DeepSeekCoder-6.7b-instruct": "This model consistently outperforms others, especially in complex subsets, making it ideal for high-complexity tasks."
+    }
+  ],
+  "For balanced performance and cost-effectiveness": [
+    {
+      "DeepSeekCoder-6.7b-base": "This model offers robust performance across most subsets and is a good balance between cost and capability."
+    },
+    {
+      "CodeGemma-7b": "This model provides decent performance in the initial subsets and is a cost-effective option for less complex tasks."
+    }
+  ],
+  "For lightweight tasks with low complexity": [
+    {
+      "CodeGemma-2b": "This model is suitable for simple tasks where cost is a major concern, though it struggles with complexity."
+    },
+    {
+      "DeepSeekCoder-1.3b-base": "This model is a lightweight option for moderate tasks but has limitations with more complex subsets."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/6/EI/CC_report.json b/llm_insight/MBPP/6/EI/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..168fac029b742c8d4542a91b1c44b4b8b90a33ae
--- /dev/null
+++ b/llm_insight/MBPP/6/EI/CC_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "CodeGemma-2b shows a significant drop in performance as the complexity of the subsets increases, with no performance in the last two subsets. This suggests it struggles with more complex tasks.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it performs better than the 2b version but still shows a decline in performance with increasing complexity. It maintains some capability in the middle subsets but fails in the last two.",
+  "CodeGemma-7b": "CodeGemma-7b has a strong start but also declines in performance with complexity. It performs similarly to the 7b-it version but shows slightly better results in the initial subsets.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has moderate performance across the first four subsets but fails in the last two, indicating limitations with more complex tasks.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows robust performance in the first four subsets and even manages to perform in the sixth subset, suggesting better handling of complexity.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct is the top performer, excelling in the first four subsets and achieving perfect scores in the last two, indicating superior capability with complex tasks.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b has inconsistent performance, with high scores in some subsets but no performance in the last two, suggesting variability in task handling.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b shows strong performance in the first three subsets but fails completely in the last three, indicating a sharp drop in capability with increased complexity.",
+  "global_insights": "The models generally perform well in the first few subsets but struggle with the more complex ones. DeepSeekCoder-6.7b-instruct stands out as the best performer, especially in handling complex tasks. The larger models (7b and 6.7b) tend to perform better than the smaller ones, but even they have limitations with the most complex subsets."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/6/EI/token_counts_recommendation.json b/llm_insight/MBPP/6/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..b76078cd991255bafbd2d6e42c61173953e898dc
--- /dev/null
+++ b/llm_insight/MBPP/6/EI/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High accuracy with high token counts": [
+    {
+      "DeepSeekCoder-6.7b-instruct": "Consistently high performance across all subsets, especially excels in high token count scenarios."
+    },
+    {
+      "DeepSeekCoder-6.7b-base": "Strong performance in high token counts, though slightly less consistent than the instruct version."
+    }
+  ],
+  "Balanced performance with moderate token counts": [
+    {
+      "CodeGemma-7b": "Good performance in subsets 1-4, suitable for tasks with moderate token counts."
+    },
+    {
+      "codeqwen2.5-7b": "Performs well in most subsets, though with some inconsistencies."
+    }
+  ],
+  "Cost-effective for low token counts": [
+    {
+      "CodeGemma-2b": "Moderate performance in low to mid token counts, suitable for budget-conscious scenarios."
+    },
+    {
+      "codeqwen2.5-1.5b": "High performance in specific subsets (1-2, 4), ideal for targeted low token count tasks."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/6/EI/token_counts_report.json b/llm_insight/MBPP/6/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..b538a9ee601b9188e783ef6b7ef42c7957deded7
--- /dev/null
+++ b/llm_insight/MBPP/6/EI/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "CodeGemma-2b shows moderate performance in subsets with lower token counts (subsets 1-4) but fails completely in subset 6. It performs best in subset 5 with 50% accuracy, indicating it may handle mid-range complexity well but struggles with very high token counts.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it performs similarly to CodeGemma-2b but with slightly better accuracy in subsets 1-3. However, it also fails in subset 6, suggesting that increasing model size alone doesn't resolve issues with very high token counts.",
+  "CodeGemma-7b": "CodeGemma-7b outperforms its smaller counterparts in subsets 1-4, showing that larger models can handle moderate token counts better. However, it still fails in subset 6, indicating a limit to scalability with very high token counts.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest performance across all subsets, particularly struggling with higher token counts (subsets 3-6). This suggests that smaller models may not be suitable for tasks with varying token complexities.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows strong performance in subsets 1-5 and excels in subset 6 with 100% accuracy. This indicates that larger base models can handle a wide range of token counts effectively.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct is the top performer across all subsets, especially excelling in subsets 4-6. This suggests that instruction-tuning significantly enhances a model's ability to handle high token counts.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b shows high performance in subsets 1-2 and subset 4 but struggles in subsets 3, 5, and 6. This indicates inconsistency in handling varying token counts, possibly due to its smaller size.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b performs well in subsets 1-3 and subset 6 but shows a drop in subset 4 and 5. This suggests that while larger models can handle high token counts, they may still have inconsistencies in mid-range complexities.",
+  "global_insights": "1. Larger models generally perform better across varying token counts, with DeepSeekCoder-6.7b-instruct being the standout performer. 2. Instruction-tuning appears to significantly improve model performance, especially for high token counts. 3. Smaller models (e.g., DeepSeekCoder-1.3b-base) struggle with higher token counts, indicating a clear trade-off between model size and performance. 4. Some models (e.g., codeqwen2.5-7b) show inconsistent performance, suggesting that model architecture and training data also play critical roles in handling token complexity."
+}
\ No newline at end of file
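
A formatting detail worth noting: every file in this batch ends without a trailing newline, which is why git prints the "\ No newline at end of file" marker after each hunk, and the bodies use 2-space JSON indentation. A hypothetical generator-side sketch that would reproduce both properties:

import json

def save_insight(obj, path):
    # indent=2 matches the layout of the committed files; writing json.dumps
    # without a final "\n" is what produces git's no-newline marker.
    with open(path, "w") as f:
        f.write(json.dumps(obj, indent=2))
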
diff --git a/llm_insight/MBPP/6/QS/CC_recommendation.json b/llm_insight/MBPP/6/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8573bd7b5250d6ca2c5f040841d85649363e42a
--- /dev/null
+++ b/llm_insight/MBPP/6/QS/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+  "High performance and versatility": [
+    {
+      "codeqwen2.5-7b": "Consistently high performance across all subsets, making it ideal for diverse and complex tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-instruct": "Strong performance in all subsets, particularly in complex tasks, with a good balance of cost and capability."
+    }
+  ],
+  "Balanced performance and cost-effectiveness": [
+    {
+      "codeqwen2.5-1.5b": "Offers robust performance at a lower computational cost, suitable for a wide range of tasks."
+    },
+    {
+      "DeepSeekCoder-6.7b-base": "Provides consistent performance across subsets, making it a reliable choice for general use."
+    }
+  ],
+  "Budget-conscious with moderate performance": [
+    {
+      "CodeGemma-7b": "Decent performance in earlier subsets, suitable for less complex tasks."
+    },
+    {
+      "CodeGemma-7b-it": "Better than the 2b version but still limited in later subsets, ideal for simpler tasks."
+    }
+  ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/6/QS/CC_report.json b/llm_insight/MBPP/6/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f1285795d69e083cfe9e06f988bc8d94cd209b1
--- /dev/null
+++ b/llm_insight/MBPP/6/QS/CC_report.json
@@ -0,0 +1,11 @@
+{
+  "CodeGemma-2b": "CodeGemma-2b shows a declining trend in performance across subsets, with the lowest scores in CC_subset_5 and CC_subset_6. This suggests the model struggles with more complex or nuanced tasks.",
+  "CodeGemma-7b-it": "CodeGemma-7b-it performs better than its 2b counterpart but still shows a drop in performance in later subsets. The model maintains relatively stable performance in the first four subsets.",
+  "CodeGemma-7b": "CodeGemma-7b demonstrates strong performance in the first four subsets but experiences a significant drop in CC_subset_6. This indicates potential limitations in handling certain types of tasks.",
+  "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest overall performance, with particularly poor results in CC_subset_4 and CC_subset_5. The model may not be suitable for complex tasks.",
+  "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows consistent performance across all subsets, with a slight dip in CC_subset_4. The model handles a wide range of tasks effectively.",
+  "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct performs well across all subsets, with the highest scores in CC_subset_3 and CC_subset_6. This model excels in diverse task scenarios.",
+  "codeqwen2.5-1.5b": "codeqwen2.5-1.5b shows strong performance across all subsets, with particularly high scores in CC_subset_2 and CC_subset_3. The model is robust and versatile.",
+  "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer, maintaining high scores across all subsets. The model demonstrates exceptional capability in handling diverse and complex tasks.",
+  "global_insights": "Larger models generally perform better, with codeqwen2.5-7b leading the pack. Performance tends to drop in later subsets, suggesting increased difficulty or complexity. DeepSeekCoder-6.7b-instruct and codeqwen2.5 models show the most consistent performance across all subsets."
+}
\ No newline at end of file
+        },
+        {
+            "DeepSeekCoder-6.7b-instruct": "Strong performance, especially in longer code snippets, suitable for varied token lengths."
+        }
+    ],
+    "Balanced performance and cost": [
+        {
+            "CodeGemma-7b": "Robust performance with a good balance of accuracy and computational cost."
+        },
+        {
+            "codeqwen2.5-1.5b": "Good performance for its size, suitable for scenarios with moderate token counts."
+        }
+    ],
+    "Budget-conscious with acceptable performance": [
+        {
+            "CodeGemma-7b-it": "Decent performance across subsets, suitable for lower-budget projects."
+        },
+        {
+            "DeepSeekCoder-6.7b-base": "Reasonable performance, especially in mid-range token subsets."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/6/QS/token_counts_report.json b/llm_insight/MBPP/6/QS/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ce5c5939853fd70d8852e8689796dec0e9a5cc9
--- /dev/null
+++ b/llm_insight/MBPP/6/QS/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "CodeGemma-2b shows a declining trend in performance as token counts increase, with a significant drop in token_subset_6 (25.6). This suggests the model struggles with longer code snippets.",
+    "CodeGemma-7b-it": "CodeGemma-7b-it performs consistently well across subsets, peaking in token_subset_2 (63.06) and maintaining relatively high scores even in token_subset_6 (40.27).",
+    "CodeGemma-7b": "CodeGemma-7b demonstrates robust performance, especially in token_subset_5 (65.41), but also shows a decline in token_subset_6 (45.87), indicating some sensitivity to token length.",
+    "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest performance overall, with a sharp drop in token_subset_6 (19.2), highlighting its limitations with larger token counts.",
+    "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base performs well, especially in token_subset_2 (68.24) and token_subset_4 (64.71), but shows a decline in token_subset_6 (44.0).",
+    "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct maintains high performance across all subsets, with the best scores in token_subset_2 (69.62) and token_subset_6 (63.38), indicating strong handling of varying token lengths.",
+    "codeqwen2.5-1.5b": "codeqwen2.5-1.5b shows strong performance, peaking in token_subset_3 (76.47) and maintaining decent scores in token_subset_6 (59.32).",
+    "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer, with the highest scores across all subsets, especially in token_subset_2 (83.82) and token_subset_4 (79.41), and a strong showing in token_subset_6 (69).",
+    "global_insights": "Larger models generally perform better across all token subsets, with codeqwen2.5-7b leading. Performance tends to decline with increasing token counts, but models like DeepSeekCoder-6.7b-instruct and codeqwen2.5-7b handle longer snippets more effectively. Smaller models like DeepSeekCoder-1.3b-base struggle significantly with larger token counts."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/7/EI/CC_recommendation.json b/llm_insight/MBPP/7/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..63926f36df58f5172484acdb03fb1a93c3e63ed6
--- /dev/null
+++ b/llm_insight/MBPP/7/EI/CC_recommendation.json
@@ -0,0 +1,23 @@
+{
+    "High performance with cost-effectiveness": [
+        {
+            "codeqwen2.5-1.5b": "Offers a good balance of performance and cost, especially in subsets 1-3 and 7."
+        },
+        {
+            "DeepSeekCoder-6.7b-instruct": "Performs well in multiple subsets and is a good choice for tasks similar to subsets 1, 2, 4, and 7."
+        }
+    ],
+    "Best overall performance": [
+        {
+            "codeqwen2.5-7b": "The top performer across most subsets, ideal for tasks where high accuracy is critical."
+        }
+    ],
+    "Budget-conscious with moderate performance": [
+        {
+            "DeepSeekCoder-1.3b-base": "A cost-effective option for tasks in subsets 1-4 where moderate performance is acceptable."
+        },
+        {
+            "CodeGemma-7b-it": "Provides decent performance in subsets 1-4 at a lower cost compared to larger models."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/7/EI/CC_report.json b/llm_insight/MBPP/7/EI/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5f26d18ada91f188fd8dc5bde13fdf28907618f
--- /dev/null
+++ b/llm_insight/MBPP/7/EI/CC_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "The model shows a significant drop in performance across subsets, particularly in subsets 5, 6, and 7 where it scores 0.0. This indicates poor generalization on more complex or diverse tasks.",
+    "CodeGemma-7b-it": "Performs better than CodeGemma-2b, showing moderate performance in subsets 1-4, but it fails completely in subsets 5 and 6.",
+    "CodeGemma-7b": "Shows inconsistent performance, with high scores in subsets 1-3 but very low scores in subsets 4 and 6. It performs moderately in subset 5.",
+    "DeepSeekCoder-1.3b-base": "Consistently moderate performance across subsets 1-4 but fails in subsets 5-7. This suggests limitations in handling more complex tasks.",
+    "DeepSeekCoder-6.7b-base": "Performs well in subsets 1-3 and moderately in subsets 5 and 7. However, it fails in subset 6 and shows low performance in subset 4.",
+    "DeepSeekCoder-6.7b-instruct": "Shows strong performance in subsets 1, 2, 4, and 7 but fails in subset 6. This indicates good generalization but with some limitations.",
+    "codeqwen2.5-1.5b": "Performs well in subsets 1-3 and exceptionally in subset 7. However, it fails in subset 6 and shows moderate performance in subsets 4 and 5.",
+    "codeqwen2.5-7b": "The best performer overall, with high scores in subsets 1-4 and 7. It fails in subsets 5 and 6, suggesting some limitations in certain task types.",
+    "global_insights": "Larger models generally perform better, with codeqwen2.5-7b being the top performer. Subsets 5 and 6 are particularly challenging for most models, indicating these may represent more complex or niche tasks. Subset 7 shows high performance for some models, suggesting it may contain tasks that align well with certain model strengths."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/7/EI/token_counts_recommendation.json b/llm_insight/MBPP/7/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..f92a6540a69c82b0f82b2e0adfc027f98c07d5c7
--- /dev/null
+++ b/llm_insight/MBPP/7/EI/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High complexity tasks with no cost constraints": [
+        {
+            "DeepSeekCoder-6.7b-instruct": "Consistently outperforms all other models, especially in high token count subsets, making it ideal for complex tasks."
+        },
+        {
+            "codeqwen2.5-7b": "Also performs exceptionally well in high token subsets, offering a strong alternative."
+        }
+    ],
+    "Balanced tasks with moderate cost": [
+        {
+            "DeepSeekCoder-6.7b-base": "Provides good performance across most subsets without the additional cost of instruction tuning."
+        },
+        {
+            "CodeGemma-7b": "A cost-effective option for tasks that do not require handling extremely high token counts."
+        }
+    ],
+    "Low complexity tasks with tight cost constraints": [
+        {
+            "CodeGemma-2b": "Suitable for simpler tasks with lower token counts, offering decent performance at a lower cost."
+        },
+        {
+            "codeqwen2.5-1.5b": "Performs well in lower token subsets, providing a budget-friendly option."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/7/EI/token_counts_report.json b/llm_insight/MBPP/7/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..3fc11526ea97d5ff9bb7796270f573453767187a
--- /dev/null
+++ b/llm_insight/MBPP/7/EI/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "The model shows moderate performance in subsets with lower token counts but struggles significantly in higher token subsets, especially subset_7 where it fails completely. This suggests limitations in handling complex or lengthy code snippets.",
+    "CodeGemma-7b-it": "This model performs better than its 2b counterpart across most subsets, particularly in mid-range token counts. However, it also fails in subset_7, indicating similar limitations with very high token counts.",
+    "CodeGemma-7b": "The 7b model shows a more balanced performance across subsets, with notable improvement in subset_5. Its failure in subset_7 suggests that even larger models within the same family may struggle with extremely high token counts.",
+    "DeepSeekCoder-1.3b-base": "This model has the weakest performance across all subsets, particularly in subset_4 and subset_7. Its small size likely limits its ability to handle diverse and complex code snippets.",
+    "DeepSeekCoder-6.7b-base": "A significant improvement over the 1.3b model, this model performs well in subsets with higher token counts, especially in subset_7 where it achieves 100% accuracy. This indicates better scalability with complexity.",
+    "DeepSeekCoder-6.7b-instruct": "The best performer among all models, excelling in high token count subsets (subset_5 and subset_7) with perfect scores. This suggests that instruction tuning greatly enhances performance on complex tasks.",
+    "codeqwen2.5-1.5b": "This model shows strong performance in lower to mid-range token subsets but fails in subset_7. Its performance is inconsistent, possibly due to its smaller size.",
+    "codeqwen2.5-7b": "The larger version of codeqwen2.5 shows robust performance across most subsets, with perfect scores in subset_7. This highlights the benefits of scaling up model size.",
+    "global_insights": "1. Larger models generally perform better across all token subsets, with DeepSeekCoder-6.7b-instruct and codeqwen2.5-7b leading the pack. 2. Instruction tuning (as seen in DeepSeekCoder-6.7b-instruct) significantly boosts performance, especially in complex subsets. 3. All models struggle with subset_7 except for the largest and instruction-tuned models, indicating a common challenge with very high token counts. 4. The codeqwen2.5 family shows inconsistent performance, possibly due to architectural differences compared to other models."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/7/QS/CC_recommendation.json b/llm_insight/MBPP/7/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..443383a32744f86f054e34233a78be55dd8d8ee1
--- /dev/null
+++ b/llm_insight/MBPP/7/QS/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance and reliability": [
+        {
+            "codeqwen2.5-7b": "Consistently outperforms all other models across all subsets."
+        },
+        {
+            "DeepSeekCoder-6.7b-instruct": "Strong and consistent performance, especially in complex tasks."
+        }
+    ],
+    "Cost-effective for moderate tasks": [
+        {
+            "codeqwen2.5-1.5b": "Offers excellent performance for its size, particularly in CC_subset_4."
+        },
+        {
+            "DeepSeekCoder-6.7b-base": "Reliable performance with a good balance of cost and capability."
+        }
+    ],
+    "Budget-friendly with acceptable performance": [
+        {
+            "CodeGemma-7b": "Stable performance across most subsets, suitable for less critical tasks."
+        },
+        {
+            "CodeGemma-7b-it": "Inconsistent but can be useful for specific subsets with fine-tuning."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/7/QS/CC_report.json b/llm_insight/MBPP/7/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed06a757111bed7b368f6db6425e1818f2f7d507
--- /dev/null
+++ b/llm_insight/MBPP/7/QS/CC_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "CodeGemma-2b shows a declining trend in performance across subsets, with the lowest scores in CC_subset_6 and CC_subset_7. This suggests the model struggles with more complex or nuanced tasks.",
+    "CodeGemma-7b-it": "CodeGemma-7b-it performs well in CC_subset_2 but shows inconsistency, particularly in CC_subset_5 and CC_subset_7. The model may need fine-tuning for specific subsets.",
+    "CodeGemma-7b": "CodeGemma-7b maintains relatively stable performance across most subsets, with a slight drop in CC_subset_7. It is a robust model but may not excel in the most challenging tasks.",
+    "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest overall performance, particularly in CC_subset_6. It is not recommended for complex tasks.",
+    "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows strong performance in CC_subset_2 and CC_subset_3 but drops slightly in CC_subset_4 and CC_subset_5. It is a reliable model for most tasks.",
+    "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct performs consistently well across all subsets, with particularly high scores in CC_subset_2 and CC_subset_3. It is a top performer.",
+    "codeqwen2.5-1.5b": "codeqwen2.5-1.5b shows impressive performance, especially in CC_subset_4. It is a strong contender despite its smaller size.",
+    "codeqwen2.5-7b": "codeqwen2.5-7b is the best-performing model across all subsets, with the highest scores in CC_subset_1 and CC_subset_2. It is highly recommended for all tasks.",
+    "global_insights": "Larger models generally perform better, with codeqwen2.5-7b leading the pack. Performance drops in later subsets (CC_subset_6 and CC_subset_7) suggest these are more challenging. The split perspective (CC) and method (QS) effectively highlight model strengths and weaknesses."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/7/QS/token_counts_recommendation.json b/llm_insight/MBPP/7/QS/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..4059003bfe0265a876b18a035bfb1fee9b7a4afe
--- /dev/null
+++ b/llm_insight/MBPP/7/QS/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High accuracy tasks with varying token counts": [
+        {
+            "codeqwen2.5-7b": "Consistently high performance across all token subsets."
+        },
+        {
+            "codeqwen2.5-1.5b": "Strong performance with a smaller model size, suitable for resource-constrained environments."
+        }
+    ],
+    "Moderate accuracy tasks with cost-effectiveness": [
+        {
+            "DeepSeekCoder-6.7b-instruct": "Balanced performance and cost, especially for moderate token counts."
+        },
+        {
+            "CodeGemma-7b": "Good performance for most subsets, with a reasonable model size."
+        }
+    ],
+    "Low-resource environments": [
+        {
+            "codeqwen2.5-1.5b": "Smaller model size with competitive performance."
+        },
+        {
+            "DeepSeekCoder-1.3b-base": "Lowest resource requirements but with significant performance trade-offs."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/7/QS/token_counts_report.json b/llm_insight/MBPP/7/QS/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a0fb1d6dc46f5421ea891c13dfe4c415837b1fc
--- /dev/null
+++ b/llm_insight/MBPP/7/QS/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "The model shows moderate performance across subsets, with a noticeable decline in token_subset_7 (25.6). This suggests limitations in handling larger token counts.",
+    "CodeGemma-7b-it": "Consistently better performance than CodeGemma-2b, especially in token_subset_4 (68.0). However, it still struggles with larger token counts (token_subset_7: 40.27).",
+    "CodeGemma-7b": "Strong performance across most subsets, particularly in token_subset_2 (65.43) and token_subset_4 (66.0). The drop in token_subset_7 (45.87) indicates scalability challenges.",
+    "DeepSeekCoder-1.3b-base": "Lowest performance among all models, especially in token_subset_7 (19.2). Not suitable for tasks requiring high accuracy across varying token counts.",
+    "DeepSeekCoder-6.7b-base": "High performance in token_subset_2 (74.29) and token_subset_4 (70.0), but significant drop in token_subset_7 (44.0). Better scalability than smaller models but still limited.",
+    "DeepSeekCoder-6.7b-instruct": "Consistent performance across subsets, with a peak in token_subset_1 (73.53) and token_subset_3 (72.86). Handles larger token counts better than other DeepSeek models (token_subset_7: 63.38).",
+    "codeqwen2.5-1.5b": "Strong performance across all subsets, with the highest score in token_subset_6 (75.0). Handles larger token counts relatively well (token_subset_7: 59.32).",
+    "codeqwen2.5-7b": "Best overall performance, with top scores in token_subset_1 (82.14), token_subset_2 (83.93), and token_subset_4 (82.14). Also handles larger token counts better than others (token_subset_7: 69.49).",
+    "global_insights": "1. Larger models generally perform better across all token subsets. 2. Performance tends to decline with increasing token counts, but the extent varies by model. 3. codeqwen2.5-7b and codeqwen2.5-1.5b show the most consistent performance across subsets. 4. DeepSeekCoder-6.7b-instruct and CodeGemma-7b are good alternatives for tasks with moderate token counts."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/8/EI/CC_recommendation.json b/llm_insight/MBPP/8/EI/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..52183606eb462a6288bfeb1028ef52de120f8886
--- /dev/null
+++ b/llm_insight/MBPP/8/EI/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance and robustness": [
+        {
+            "DeepSeekCoder-6.7b-instruct": "Consistently high performance across all subsets, especially in complex tasks."
+        },
+        {
+            "codeqwen2.5-7b": "Top performer in most subsets, indicating strong general capabilities."
+        }
+    ],
+    "Balanced performance and cost-effectiveness": [
+        {
+            "DeepSeekCoder-6.7b-base": "Good performance in most subsets with a lower cost compared to the instruct version."
+        },
+        {
+            "CodeGemma-7b": "Decent performance in simpler tasks at a potentially lower cost."
+        }
+    ],
+    "Lightweight and moderate performance": [
+        {
+            "codeqwen2.5-1.5b": "Reasonable performance in simpler tasks with a smaller model size."
+        },
+        {
+            "DeepSeekCoder-1.3b-base": "Suitable for basic tasks where high performance is not critical."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/8/EI/CC_report.json b/llm_insight/MBPP/8/EI/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..de5a4764b1e4b52d2ddad5aa47d7b6eaf0d1e121
--- /dev/null
+++ b/llm_insight/MBPP/8/EI/CC_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "The model shows moderate performance in subsets 1-5 but fails completely in subsets 6-8, indicating a limitation in handling more complex or specific tasks.",
+    "CodeGemma-7b-it": "Performs better than the 2b version across all subsets but still struggles with subsets 6-8, suggesting that while larger, it still has limitations in certain scenarios.",
+    "CodeGemma-7b": "Shows the best performance among the CodeGemma models, particularly in subsets 1-3, but like others, fails in subsets 6-8.",
+    "DeepSeekCoder-1.3b-base": "Performs similarly to CodeGemma-2b but shows slightly better results in subset 3. However, it also fails in subsets 6-8.",
+    "DeepSeekCoder-6.7b-base": "Demonstrates strong performance in subsets 1-5 and surprisingly achieves 100% in subset 8, indicating robustness in certain complex tasks.",
+    "DeepSeekCoder-6.7b-instruct": "The best performer among all models, achieving high scores across most subsets and perfect scores in subsets 7 and 8, showcasing its ability to handle diverse and complex tasks.",
+    "codeqwen2.5-1.5b": "Shows strong performance in subsets 1-4 but drops significantly in subset 5 and fails in subset 6, with a surprising 100% in subset 8.",
+    "codeqwen2.5-7b": "The top performer overall, with consistently high scores across subsets 1-4 and a perfect score in subset 8, though it struggles in subsets 5-7.",
+    "global_insights": "Larger models generally perform better, with DeepSeekCoder-6.7b-instruct and codeqwen2.5-7b leading the pack. Subsets 6-8 are particularly challenging, with most models failing, except for a few that show unexpected high performance in subset 8. This suggests that subset 8 might contain tasks that are either very specific or require advanced capabilities that only certain models possess."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/8/EI/token_counts_recommendation.json b/llm_insight/MBPP/8/EI/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b74670d7393cdbe8ecc0994dbf65b1b5222ed20
--- /dev/null
+++ b/llm_insight/MBPP/8/EI/token_counts_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance and robustness across all token ranges": [
+        {
+            "DeepSeekCoder-6.7b-instruct": "This model consistently performs well across all token subsets, making it ideal for scenarios requiring high accuracy and reliability."
+        },
+        {
+            "codeqwen2.5-7b": "This model shows strong performance in most subsets and handles higher token counts well, making it a reliable choice for diverse inputs."
+        }
+    ],
+    "Cost-effective for mid-range token subsets": [
+        {
+            "CodeGemma-7b": "This model offers a good balance of performance and cost, particularly effective in mid-range token subsets."
+        },
+        {
+            "DeepSeekCoder-6.7b-base": "A cost-effective option that performs well in higher token subsets, though with some variability in mid-range subsets."
+        }
+    ],
+    "Budget-friendly for lower token counts": [
+        {
+            "CodeGemma-2b": "Suitable for scenarios with lower token counts where budget is a constraint, though performance drops with higher tokens."
+        },
+        {
+            "codeqwen2.5-1.5b": "A budget-friendly option that excels in specific token ranges, but may not be reliable for all scenarios."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/8/EI/token_counts_report.json b/llm_insight/MBPP/8/EI/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3a93b409dcb938ac1ee854ce06a8f87d969f3d6
--- /dev/null
+++ b/llm_insight/MBPP/8/EI/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "The model shows moderate performance in subsets with lower token counts but fails completely in the highest token subset (subset_8). Performance is inconsistent across subsets, suggesting sensitivity to token count variations.",
+    "CodeGemma-7b-it": "Similar to CodeGemma-2b, this model performs well in mid-range token subsets but struggles with higher token counts. The performance drop in subset_8 is notable.",
+    "CodeGemma-7b": "This model demonstrates better consistency across subsets compared to its smaller counterparts, but still shows a significant drop in subset_8. It performs well in subset_6, indicating robustness in certain token ranges.",
+    "DeepSeekCoder-1.3b-base": "The model's performance is generally poor, especially in higher token subsets. It fails completely in subset_6 and subset_8, indicating limitations with larger inputs.",
+    "DeepSeekCoder-6.7b-base": "This model shows strong performance in most subsets, particularly in subset_6 and subset_8, suggesting better handling of higher token counts. However, there is variability in mid-range subsets.",
+    "DeepSeekCoder-6.7b-instruct": "The best performer overall, with high scores across all subsets except subset_7. It handles higher token counts exceptionally well, achieving 100% in subset_6 and subset_8.",
+    "codeqwen2.5-1.5b": "This model shows high performance in subset_5 but fails in subset_8. The inconsistency suggests that while it excels in certain token ranges, it may not be reliable for all scenarios.",
+    "codeqwen2.5-7b": "The model performs well in most subsets, particularly in subset_1 and subset_3. It handles subset_8 well, indicating robustness with higher token counts.",
+    "global_insights": "Models generally perform better in lower to mid-range token subsets, with performance dropping in higher token counts. However, larger models like DeepSeekCoder-6.7b-instruct and codeqwen2.5-7b show better consistency and robustness across all token ranges. The EI split method reveals that token count significantly impacts model performance, with higher counts posing challenges for most models."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/8/QS/CC_recommendation.json b/llm_insight/MBPP/8/QS/CC_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..70bd0b97fbaab8a69cbcb6db8a54382a4bf721a6
--- /dev/null
+++ b/llm_insight/MBPP/8/QS/CC_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance with no budget constraints": [
+        {
+            "codeqwen2.5-7b": "Consistently the top performer across all subsets, ideal for high-stakes or critical applications."
+        },
+        {
+            "DeepSeekCoder-6.7b-instruct": "Highly consistent and robust performance, suitable for scenarios requiring reliability."
+        }
+    ],
+    "Balanced performance and cost-effectiveness": [
+        {
+            "CodeGemma-7b": "Offers strong performance at a potentially lower cost than the top-tier models."
+        },
+        {
+            "codeqwen2.5-1.5b": "Provides competitive performance, especially in certain subsets, at a lower computational cost."
+        }
+    ],
+    "Budget-conscious with acceptable performance": [
+        {
+            "DeepSeekCoder-6.7b-base": "Decent performance with some variability, suitable for less critical tasks."
+        },
+        {
+            "CodeGemma-7b-it": "Better than the smallest models but more affordable than the largest ones."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/8/QS/CC_report.json b/llm_insight/MBPP/8/QS/CC_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..7be1a7418c0c898fb56bc5641156c9a03cd9b8a4
--- /dev/null
+++ b/llm_insight/MBPP/8/QS/CC_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "CodeGemma-2b shows a declining trend in performance across subsets, with the lowest scores in CC_subset_7 and CC_subset_8. This suggests it struggles with more complex or nuanced tasks.",
+    "CodeGemma-7b-it": "CodeGemma-7b-it performs consistently better than CodeGemma-2b, with a noticeable drop in CC_subset_7 and CC_subset_8 but still maintaining relatively higher scores.",
+    "CodeGemma-7b": "CodeGemma-7b demonstrates robust performance across most subsets, though it also shows a decline in later subsets. It outperforms its smaller counterparts consistently.",
+    "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has erratic performance, with high variability across subsets. It performs poorly in CC_subset_7 but recovers slightly in CC_subset_8.",
+    "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base shows strong performance in early subsets but experiences a dip in CC_subset_5 and CC_subset_7, though it recovers in CC_subset_8.",
+    "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct is one of the top performers, maintaining high scores across most subsets with only minor fluctuations.",
+    "codeqwen2.5-1.5b": "codeqwen2.5-1.5b has impressive performance, particularly in CC_subset_5, but shows some inconsistency in later subsets.",
+    "codeqwen2.5-7b": "codeqwen2.5-7b is the top performer overall, with the highest scores in most subsets, though it also shows a slight decline in CC_subset_8.",
+    "global_insights": "Larger models generally outperform smaller ones, with codeqwen2.5-7b leading the pack. Performance tends to decline in later subsets, suggesting these may represent more challenging tasks. The instruct-tuned models (e.g., DeepSeekCoder-6.7b-instruct) show more consistent performance compared to their base counterparts."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/8/QS/token_counts_recommendation.json b/llm_insight/MBPP/8/QS/token_counts_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..89465b4b1fca54fd810ba0cf601df6dbd70f4d97
--- /dev/null
+++ b/llm_insight/MBPP/8/QS/token_counts_recommendation.json
@@ -0,0 +1,34 @@
+{
+    "High accuracy required regardless of cost": [
+        {
+            "codeqwen2.5-7b": "Consistently top performer across all token subsets"
+        },
+        {
+            "DeepSeekCoder-6.7b-instruct": "Strong performance in most subsets, particularly in lower token counts"
+        }
+    ],
+    "Cost-effective solution for moderate token counts": [
+        {
+            "codeqwen2.5-1.5b": "Excellent performance for its size, especially in mid-range token subsets"
+        },
+        {
+            "CodeGemma-7b": "Good balance of performance across subsets without the highest cost"
+        }
+    ],
+    "Budget-constrained scenarios": [
+        {
+            "codeqwen2.5-1.5b": "Outperforms other small models significantly"
+        },
+        {
+            "CodeGemma-7b-it": "Provides decent performance at lower cost than larger models"
+        }
+    ],
+    "Handling very long code snippets": [
+        {
+            "codeqwen2.5-7b": "Maintains the best performance in highest token subsets"
+        },
+        {
+            "codeqwen2.5-1.5b": "Surprisingly robust performance even in token_subset_7 and 8"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/8/QS/token_counts_report.json b/llm_insight/MBPP/8/QS/token_counts_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..6f940c91c3f4f781524561899ed1875106468142
--- /dev/null
+++ b/llm_insight/MBPP/8/QS/token_counts_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "The model shows a declining performance trend as token counts increase, indicating potential struggles with longer code snippets. It performs best in token_subset_2 (56.67) but drops significantly in token_subset_8 (28.0).",
+    "CodeGemma-7b-it": "This model demonstrates better performance in mid-range token subsets (token_subset_3: 63.33, token_subset_5: 62.77) but shows inconsistency with higher token counts.",
+    "CodeGemma-7b": "Consistently strong performance across most subsets, particularly in token_subset_2 (69.67) and token_subset_7 (63.38). However, it still shows a drop in token_subset_8 (45.54).",
+    "DeepSeekCoder-1.3b-base": "The smallest model struggles significantly with higher token counts, especially in token_subset_8 (18.15). Performance is mediocre across all subsets.",
+    "DeepSeekCoder-6.7b-base": "Excellent performance in token_subset_2 (75.0) and maintains relatively strong results in other subsets except token_subset_7 (47.69).",
+    "DeepSeekCoder-6.7b-instruct": "Shows strong performance in token_subset_1 (75.0) and token_subset_4 (71.67), with consistent results across most subsets. The drop in token_subset_5 (52.46) is notable.",
+    "codeqwen2.5-1.5b": "Surprisingly strong for its size, with excellent performance in token_subset_3 (77.08) and token_subset_7 (76.92). Maintains decent performance even in higher token counts.",
+    "codeqwen2.5-7b": "The top performer across almost all subsets, particularly in token_subset_2 (85.42) and token_subset_5 (82.69). Shows remarkable consistency even with increasing token counts.",
+    "global_insights": "1) Larger models generally hold up better as token counts increase, with codeqwen2.5-7b being the most consistent. 2) There's a clear performance drop in higher token subsets for most models except codeqwen2.5 series. 3) The 1.5b parameter models (codeqwen2.5-1.5b) outperform similarly sized models (DeepSeekCoder-1.3b-base) significantly. 4) Instruction-tuned models don't always outperform their base counterparts in this evaluation."
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/problem_type_recommendation.json b/llm_insight/MBPP/problem_type_recommendation.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7d4a65a669294a337bff16d5c6f6af816de48e5
--- /dev/null
+++ b/llm_insight/MBPP/problem_type_recommendation.json
@@ -0,0 +1,26 @@
+{
+    "High performance across all problem types": [
+        {
+            "codeqwen2.5-1.5b": "It is the only model that handles Other problems and performs exceptionally well in Math and String problems."
+        },
+        {
+            "codeqwen2.5-7b": "It leads in Math problems and performs well in Array and String problems, though it fails in Other problems."
+        }
+    ],
+    "Cost-effective for Math and String problems": [
+        {
+            "CodeGemma-7b-it": "It provides a good balance of performance and cost, especially for Math problems."
+        },
+        {
+            "DeepSeekCoder-6.7b-base": "It excels in Math problems and performs well in Array and String problems, making it a cost-effective choice."
+        }
+    ],
+    "General purpose with moderate performance": [
+        {
+            "CodeGemma-7b": "It offers consistent performance across Array, String, and Math problems, though it does not handle Other problems."
+        },
+        {
+            "DeepSeekCoder-6.7b-instruct": "It performs well in Array problems and is a decent choice for general purposes."
+        }
+    ]
+}
\ No newline at end of file
diff --git a/llm_insight/MBPP/problem_type_report.json b/llm_insight/MBPP/problem_type_report.json
new file mode 100644
index 0000000000000000000000000000000000000000..a310eafc1d131a6008e2860bc6278dfd4b27727a
--- /dev/null
+++ b/llm_insight/MBPP/problem_type_report.json
@@ -0,0 +1,11 @@
+{
+    "CodeGemma-2b": "CodeGemma-2b performs moderately on Array and String problems but shows strong performance in Math problems. It fails to handle Other problem types.",
+    "CodeGemma-7b-it": "CodeGemma-7b-it shows improvement over CodeGemma-2b across all problem types, especially in Math problems. However, it also fails in the Other category.",
+    "CodeGemma-7b": "CodeGemma-7b demonstrates consistent performance across Array, String, and Math problems, with a slight dip in String compared to CodeGemma-7b-it. It does not handle Other problems.",
+    "DeepSeekCoder-1.3b-base": "DeepSeekCoder-1.3b-base has the lowest performance among all models, particularly in Array problems. It also fails in the Other category.",
+    "DeepSeekCoder-6.7b-base": "DeepSeekCoder-6.7b-base excels in Math problems and performs well in Array and String problems. It does not handle Other problems.",
+    "DeepSeekCoder-6.7b-instruct": "DeepSeekCoder-6.7b-instruct shows strong performance in Array problems but has a noticeable drop in String and Math compared to its base counterpart. It fails in Other problems.",
+    "codeqwen2.5-1.5b": "codeqwen2.5-1.5b is the only model that successfully handles Other problems, achieving a perfect score. It also performs exceptionally well in Math and String problems.",
+    "codeqwen2.5-7b": "codeqwen2.5-7b leads in Math problems and performs well in Array and String problems. However, it fails in the Other category.",
+    "global_insights": "Math problems are generally well-handled by most models, with codeqwen2.5-7b leading. String problems show variability, with codeqwen2.5-1.5b performing best. Array problems are handled best by codeqwen2.5-7b. Only codeqwen2.5-1.5b can handle Other problems. Larger models generally perform better, but codeqwen2.5-1.5b is an exception, outperforming many larger models."
+}
\ No newline at end of file
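Note on the data layout introduced by this patch: every `*_report.json` is a flat object mapping a model name to a prose analysis, plus one `global_insights` summary key, while every `*_recommendation.json` maps a scenario label to a list of single-key `{model: rationale}` objects. The following is a minimal sketch of how such files could be consumed from Python under the directory layout added here; the helper names `load_report` and `flatten_recommendations` are hypothetical and not part of app.py.

import json

def load_report(path):
    """Split a *_report.json file into per-model analyses and the global summary."""
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    # Every report file in this patch carries "global_insights" alongside the model keys.
    global_insights = data.pop("global_insights", "")
    return data, global_insights

def flatten_recommendations(path):
    """Yield (scenario, model, rationale) rows from a *_recommendation.json file."""
    with open(path, encoding="utf-8") as f:
        scenarios = json.load(f)
    for scenario, entries in scenarios.items():
        # Each scenario holds a list of single-key {model: rationale} objects.
        for entry in entries:
            for model, rationale in entry.items():
                yield scenario, model, rationale

if __name__ == "__main__":
    models, insights = load_report("llm_insight/MBPP/8/QS/token_counts_report.json")
    print(len(models), "models analysed;", insights[:60], "...")
    for row in flatten_recommendations("llm_insight/MBPP/8/QS/token_counts_recommendation.json"):
        print(row)

Flattening to rows rather than merging into a single dict matters because the same model can appear under several scenarios (for example, codeqwen2.5-1.5b is recommended in three categories of llm_insight/MBPP/8/QS/token_counts_recommendation.json); a naive merge would silently keep only one rationale per model.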