ajaxzhan committed — Commit 32b50e6 · 1 Parent: ee077b5

add analysis and vis support

Note: this view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- app.py +428 -296
- app_back.py +363 -0
- llm_insight/HumanEval/3/EI/CC_recommendation.json +26 -0
- llm_insight/HumanEval/3/EI/CC_report.json +15 -0
- llm_insight/HumanEval/3/EI/line_counts_recommendation.json +23 -0
- llm_insight/HumanEval/3/EI/line_counts_report.json +15 -0
- llm_insight/HumanEval/3/EI/token_counts_recommendation.json +26 -0
- llm_insight/HumanEval/3/EI/token_counts_report.json +15 -0
- llm_insight/HumanEval/3/QS/CC_recommendation.json +26 -0
- llm_insight/HumanEval/3/QS/CC_report.json +14 -0
- llm_insight/HumanEval/3/QS/line_counts_recommendation.json +26 -0
- llm_insight/HumanEval/3/QS/line_counts_report.json +14 -0
- llm_insight/HumanEval/3/QS/token_counts_recommendation.json +26 -0
- llm_insight/HumanEval/3/QS/token_counts_report.json +14 -0
- llm_insight/HumanEval/4/EI/CC_recommendation.json +26 -0
- llm_insight/HumanEval/4/EI/CC_report.json +14 -0
- llm_insight/HumanEval/4/EI/line_counts_recommendation.json +26 -0
- llm_insight/HumanEval/4/EI/line_counts_report.json +14 -0
- llm_insight/HumanEval/4/EI/token_counts_recommendation.json +23 -0
- llm_insight/HumanEval/4/EI/token_counts_report.json +14 -0
- llm_insight/HumanEval/4/QS/CC_recommendation.json +26 -0
- llm_insight/HumanEval/4/QS/CC_report.json +14 -0
- llm_insight/HumanEval/4/QS/line_counts_recommendation.json +26 -0
- llm_insight/HumanEval/4/QS/line_counts_report.json +14 -0
- llm_insight/HumanEval/4/QS/token_counts_recommendation.json +26 -0
- llm_insight/HumanEval/4/QS/token_counts_report.json +14 -0
- llm_insight/HumanEval/5/EI/CC_recommendation.json +26 -0
- llm_insight/HumanEval/5/EI/CC_report.json +14 -0
- llm_insight/HumanEval/5/EI/line_counts_recommendation.json +26 -0
- llm_insight/HumanEval/5/EI/line_counts_report.json +14 -0
- llm_insight/HumanEval/5/EI/token_counts_recommendation.json +29 -0
- llm_insight/HumanEval/5/EI/token_counts_report.json +14 -0
- llm_insight/HumanEval/5/QS/CC_recommendation.json +29 -0
- llm_insight/HumanEval/5/QS/CC_report.json +14 -0
- llm_insight/HumanEval/5/QS/line_counts_recommendation.json +23 -0
- llm_insight/HumanEval/5/QS/line_counts_report.json +14 -0
- llm_insight/HumanEval/5/QS/token_counts_recommendation.json +26 -0
- llm_insight/HumanEval/5/QS/token_counts_report.json +15 -0
- llm_insight/HumanEval/6/EI/CC_recommendation.json +26 -0
- llm_insight/HumanEval/6/EI/CC_report.json +14 -0
- llm_insight/HumanEval/6/EI/line_counts_recommendation.json +31 -0
- llm_insight/HumanEval/6/EI/line_counts_report.json +14 -0
- llm_insight/HumanEval/6/EI/token_counts_recommendation.json +26 -0
- llm_insight/HumanEval/6/EI/token_counts_report.json +14 -0
- llm_insight/HumanEval/6/QS/CC_recommendation.json +26 -0
- llm_insight/HumanEval/6/QS/CC_report.json +14 -0
- llm_insight/HumanEval/6/QS/line_counts_recommendation.json +34 -0
- llm_insight/HumanEval/6/QS/line_counts_report.json +15 -0
- llm_insight/HumanEval/6/QS/token_counts_recommendation.json +23 -0
- llm_insight/HumanEval/6/QS/token_counts_report.json +15 -0
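The added insight files all follow one regular layout: llm_insight/<dataset>/<num_subsets>/<QS|EI>/<perspective>_<report|recommendation>.json, where QS is the equal-frequency (quantile) split and EI the equal-interval split. As a reading aid only — this helper is illustrative and not part of the commit — the layout can be expressed as:

from pathlib import Path

# Illustrative helper: builds the insight-file paths implied by the file list above.
# "QS" = Equal Frequency Partitioning (quantile split), "EI" = Equal Interval Partitioning.
def insight_path(dataset: str, num_subsets: int, method: str,
                 perspective: str, kind: str, root: str = "llm_insight") -> Path:
    assert method in {"QS", "EI"}
    assert kind in {"report", "recommendation"}
    return Path(root) / dataset / str(num_subsets) / method / f"{perspective}_{kind}.json"

# e.g. llm_insight/HumanEval/3/EI/CC_recommendation.json
print(insight_path("HumanEval", 3, "EI", "CC", "recommendation"))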
app.py
CHANGED

Two hunks, @@ -1,103 +1,37 @@ and @@ -139,225 +73,423 @@, replace the previous implementation wholesale: the unused requests/os/shutil/subprocess imports, the checkbox-driven on_confirm that loaded one CSV per selected perspective and merged the results, the execute_specified_python_files helper, plot_csv, update_radio_options, the toggle_radio/toggle_line_counts_visibility callbacks, and the checkbox-plus-dropdown Gradio layout. That code is preserved verbatim in the new app_back.py below, so the removed lines are not repeated here. The resulting app.py (new-file lines 38-72, the body of generate_css, are unchanged context collapsed by the diff view; Chinese comments are translated to English):

import gradio as gr
import pandas as pd
import json
import plotly.express as px

def on_confirm(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
    # Build the file path from the user-selected parameters
    num_parts = num_parts_dropdown
    if dataset_radio == "HumanEval":
        base_path = "./dividing_into_different_subsets"
    else:  # MBPP
        base_path = "./dividing_into_different_subsets_mbpp"

    method = "QS" if division_method_radio == "Equal Frequency Partitioning" else "EI"

    # Read the file that matches the selected perspective
    if "Tokens" in perspective_radio:
        df = pd.read_csv(f"{base_path}/{num_parts}/{method}/token_counts_{method}.csv")
    elif "Lines" in perspective_radio:
        df = pd.read_csv(f"{base_path}/{num_parts}/{method}/line_counts_{method}.csv")
    elif "Complexity" in perspective_radio:
        df = pd.read_csv(f"{base_path}/{num_parts}/{method}/CC_{method}.csv")
    elif "Problem Types" in perspective_radio:
        df = pd.read_csv(f"{base_path}/cata_result.csv")

    # Load the analysis report
    analysis_result, _ = load_analysis_report(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio)
    # Attach the AI-analysis column
    df["Analysis"] = df["Model"].map(lambda m: analysis_result.get(m, "No analysis provided."))
    return df

# Generate the CSS styles
def generate_css(line_counts, token_counts, cyclomatic_complexity, problem_type, show_high, show_medium, show_low):
    css = """
    #dataframe th {
    [lines 38-72 unchanged; the full generate_css body appears in app_back.py below]

    return css

# AI analysis
def load_analysis_report(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
    num_parts = num_parts_dropdown
    method = "QS" if division_method_radio == "Equal Frequency Partitioning" else "EI"

    # Determine the file paths from the perspective
    if "Tokens" in perspective_radio:
        perspective = "token_counts"
    elif "Lines" in perspective_radio:
        perspective = "line_counts"
    elif "Complexity" in perspective_radio:
        perspective = "CC"
    else:
        perspective = "problem_type"

    base_path = "./llm_insight"
    if perspective == "problem_type":
        report_file = f"{base_path}/{dataset_radio}/{perspective}_report.json"
        recommendation_file = f"{base_path}/{dataset_radio}/{perspective}_recommendation.json"
    else:
        report_file = f"{base_path}/{dataset_radio}/{num_parts}/{method}/{perspective}_report.json"
        recommendation_file = f"{base_path}/{dataset_radio}/{num_parts}/{method}/{perspective}_recommendation.json"

    try:
        with open(report_file, 'r', encoding='utf-8') as f:
            analysis_result = json.load(f)
    except Exception as e:
        analysis_result = f"[Error] failed to load analysis report: {e}"

    try:
        with open(recommendation_file, 'r', encoding='utf-8') as f:
            recommendation_result = json.load(f)
    except Exception as e:
        recommendation_result = f"[Error] failed to load model recommendation: {e}"

    return (analysis_result, recommendation_result)

# Visualization
def plot_visualization(dataset_radio, perspective_radio, num_parts, plot_type):
    if dataset_radio == "HumanEval":
        base_path = "./dividing_into_different_subsets"
    else:  # MBPP
        base_path = "./dividing_into_different_subsets_mbpp"

    if "Tokens" in perspective_radio:
        file_path = f'{base_path}/{num_parts}/QS/token_counts_QS.csv'
    elif "Lines" in perspective_radio:
        file_path = f'{base_path}/{num_parts}/QS/line_counts_QS.csv'
    elif "Complexity" in perspective_radio:
        file_path = f'{base_path}/{num_parts}/QS/CC_QS.csv'
    else:  # Problem Types
        file_path = f'{base_path}/cata_result.csv'

    df = pd.read_csv(file_path)
    df.set_index('Model', inplace=True)
    df_transposed = df.T

    if plot_type == "Line Chart":
        fig = px.line(df_transposed,
                      x=df_transposed.index,
                      y=df_transposed.columns,
                      title='Model Performance Across Different Subsets',
                      labels={'value': 'Evaluation Score', 'index': 'Subsets'},
                      color_discrete_sequence=px.colors.qualitative.Plotly)
        fig.update_traces(hovertemplate='%{y}')
    elif plot_type == "Radar Chart":
        # Reshape the data into the long format the radar chart needs
        radar_data = []
        for model in df.index:
            for subset, score in df.loc[model].items():
                radar_data.append({
                    'Model': model,
                    'Subset': subset,
                    'Score': score
                })

        radar_df = pd.DataFrame(radar_data)

        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

        # Create the radar chart
        fig = px.line_polar(radar_df,
                            r='Score',
                            theta='Subset',
                            color='Model',
                            line_close=True,
                            color_discrete_sequence=colors,
                            title='Model Performance Radar Chart')

        # Customize each model's line style
        for i, trace in enumerate(fig.data):
            trace.update(
                fill=None,  # remove the fill
                line=dict(
                    width=2,
                    dash='solid' if i % 2 == 0 else 'dash',  # alternate solid and dashed lines
                )
            )

        # Polish the radar chart layout
        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 100],
                    showline=True,
                    linewidth=1,
                    gridcolor='lightgrey'
                ),
                angularaxis=dict(
                    showline=True,
                    linewidth=1,
                    gridcolor='lightgrey'
                )
            ),
            showlegend=True,
            legend=dict(
                yanchor="middle",  # vertically centered
                y=0.5,
                xanchor="left",
                x=1.2,  # move the legend to the right of the chart
                bgcolor="rgba(255, 255, 255, 0.8)",  # semi-transparent white background
                bordercolor="lightgrey",  # add a border
                borderwidth=1
            ),
            margin=dict(r=150),  # extra right margin to make room for the legend
            paper_bgcolor='white'
        )
    else:  # Heatmap
        # Create the heatmap
        fig = px.imshow(df_transposed,
                        labels=dict(x="Model", y="Subset", color="Score"),
                        color_continuous_scale="RdYlBu_r",  # red-yellow-blue, a publication-style palette
                        aspect="auto",  # adjust the aspect ratio automatically
                        title="Model Performance Heatmap")

        # Polish the heatmap layout
        fig.update_layout(
            title=dict(
                text='Model Performance Distribution Across Subsets',
                x=0.5,
                y=0.95,
                xanchor='center',
                yanchor='top',
                font=dict(size=14)
            ),
            xaxis=dict(
                title="Model",
                tickangle=45,  # tilt the model names
                tickfont=dict(size=10),
                side="bottom"
            ),
            yaxis=dict(
                title="Subset",
                tickfont=dict(size=10)
            ),
            coloraxis=dict(
                colorbar=dict(
                    title="Score",
                    titleside="right",
                    tickfont=dict(size=10),
                    titlefont=dict(size=12),
                    len=0.9,  # colorbar length
                )
            ),
            margin=dict(t=80, r=100, b=80, l=80),  # adjust the margins
            paper_bgcolor='white',
            plot_bgcolor='white'
        )

        # Annotate each cell with its value
        annotations = []
        for i in range(len(df_transposed.index)):
            for j in range(len(df_transposed.columns)):
                annotations.append(
                    dict(
                        x=j,
                        y=i,
                        text=f"{df_transposed.iloc[i, j]:.1f}",
                        showarrow=False,
                        font=dict(size=9, color='black')
                    )
                )
        fig.update_layout(annotations=annotations)

    return fig

# Sunburst chart
def plot_recommendation_sunburst(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio):
    import plotly.graph_objects as go
    _, recommendation_result = load_analysis_report(dataset_radio, num_parts_dropdown, perspective_radio, division_method_radio)
    labels = ['Model Recommendation']  # root node
    parents = ['']
    values = []
    customdata = ['Choose your preference model']

    # Count the models under each scenario
    scenario_model_count = {}
    total_model_count = 0

    for scenario, model_list in recommendation_result.items():
        # Collect the model entries
        model_items = []
        if isinstance(model_list, dict):
            model_items = model_list.items()
        elif isinstance(model_list, list):
            for d in model_list:
                if isinstance(d, dict):
                    for k, v in d.items():
                        model_items.append((k, v))

        scenario_model_count[scenario] = len(model_items)
        total_model_count += len(model_items)

    # Value for the root node
    values.append(total_model_count)

    # Second pass: fill labels/parents/values/customdata
    for scenario, model_list in recommendation_result.items():
        scenario_words = scenario.split()
        short_label = " ".join(scenario_words[:3]) + "..." if len(scenario_words) > 3 else scenario
        labels.append(short_label)
        parents.append('Model Recommendation')
        values.append(scenario_model_count[scenario])
        customdata.append(scenario)

        # Collect the model entries
        model_items = []
        if isinstance(model_list, dict):
            model_items = model_list.items()
        elif isinstance(model_list, list):
            for d in model_list:
                if isinstance(d, dict):
                    for k, v in d.items():
                        model_items.append((k, v))

        for model, reason in model_items:
            labels.append(model)
            parents.append(short_label)
            values.append(1)
            customdata.append(reason)

    fig = go.Figure(go.Sunburst(
        labels=labels,
        parents=parents,
        values=values,
        branchvalues="total",
        hovertemplate='%{customdata}<extra></extra>',
        customdata=customdata
    ))
    fig.update_layout(margin=dict(t=10, l=10, r=10, b=10), height=500)
    return fig

### Gradio UI section ###

# Custom CSS styles
custom_css = """
<style>
body {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    background-color: #f9f9f9;
}
.gr-label {
    font-size: 15px;
}
.gr-button-primary {
    background-color: #4CAF50;
    color: white;
    border-radius: 8px;
}
.gr-tabs > .tab-nav {
    background-color: #e0e0e0;
    border-bottom: 2px solid #ccc;
}
.gr-tabs > .tab-nav button.selected {
    background-color: #ffffff !important;
    border-bottom: 2px solid #4CAF50;
}
.gr-panel {
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    background-color: #fff;
}
.markdown-title {
    font-size: 1.5em;
    font-weight: bold;
    margin-bottom: 10px;
}
.analysis-box {
    background-color: #f1f8ff;
    padding: 20px;
    border-left: 5px solid #4CAF50;
    border-radius: 6px;
    margin-top: 10px;
}
.recommendation-box {
    background-color: #fff3cd;
    padding: 20px;
    border-left: 5px solid #ff9800;
    border-radius: 6px;
    margin-top: 10px;
}
</style>
"""

# Build the interface
with gr.Blocks(css=custom_css) as iface:
    gr.HTML("""
    <div style='text-align:center; padding:15px;'>
        <h1>Multi-view Code LLM Leaderboard</h1>
        <p>Multi-view Leaderboard: Evaluating Large Language Models From Multiple Views</p>
    </div>
    """)

    with gr.Row():
        # Configuration options
        with gr.Column(scale=1):
            dataset_radio = gr.Radio(
                ["HumanEval", "MBPP"],
                label="Select a dataset",
                value="HumanEval"
            )
            num_parts_slider = gr.Slider(
                minimum=3,
                maximum=8,
                step=1,
                label="Choose the Number of Subsets",
                value=3
            )

            # The former checkboxes are replaced by a single radio
            perspective_radio = gr.Radio(
                ["I - Num of Tokens in Problem Desc",
                 "II - Num of Lines in Problem Desc",
                 "III - Complexity of Reference Code",
                 "IV - Problem Types"],
                label="Choose Perspective",
                value="I - Num of Tokens in Problem Desc"
            )

            # A single, shared division-method radio
            division_method_radio = gr.Radio(
                ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
                label="Choose the Division Method",
                visible=True
            )

            confirm_btn = gr.Button("Confirm", variant="primary")

        # Main display area
        with gr.Column(scale=2):
            with gr.Tabs():
                # Table
                with gr.TabItem("Ranking Table"):
                    data_table = gr.Dataframe(headers=["Model", "Score", "Analysis"], interactive=True)
                # Visualization
                with gr.TabItem("Visualization"):
                    plot_type = gr.Radio(
                        choices=["Line Chart", "Radar Chart", "Heatmap"],
                        label="Select Plot Type",
                        value="Line Chart"
                    )
                    chart = gr.Plot()
                # AI analysis
                with gr.TabItem("Model selection suggestions"):
                    with gr.Column():
                        gr.Markdown("<h2 class='markdown-title'>🎯 Model Recommendation</h2>")
                        recommendation_plot = gr.Plot()
                        scenario_legend = gr.Markdown(value="")  # newly added legend

    def update_perspective_options(dataset):
        if dataset == "MBPP":
            return gr.update(choices=[
                "I - Num of Tokens in Problem Desc",
                "III - Complexity of Reference Code",
                "IV - Problem Types"
            ])
        else:
            return gr.update(choices=[
                "I - Num of Tokens in Problem Desc",
                "II - Num of Lines in Problem Desc",
                "III - Complexity of Reference Code",
                "IV - Problem Types"
            ])

    dataset_radio.change(
        fn=update_perspective_options,
        inputs=dataset_radio,
        outputs=perspective_radio
    )

    # Wire up the events
    confirm_btn.click(
        fn=on_confirm,
        inputs=[dataset_radio, num_parts_slider, perspective_radio, division_method_radio],
        outputs=data_table
    ).then(
        fn=load_analysis_report,
        inputs=[dataset_radio, num_parts_slider, perspective_radio, division_method_radio],
        outputs=[gr.State()]
    ).then(
        fn=plot_visualization,
        inputs=[dataset_radio, perspective_radio, num_parts_slider, plot_type],
        outputs=chart
    ).then(
        fn=plot_recommendation_sunburst,
        inputs=[dataset_radio, num_parts_slider, perspective_radio, division_method_radio],
        outputs=[recommendation_plot]  # note: this is a list
    )

    plot_type.change(
        fn=plot_visualization,
        inputs=[dataset_radio, perspective_radio, num_parts_slider, plot_type],
        outputs=chart
    )

# Launch the interface
iface.launch()
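The table and plotting code above assumes each partition CSV has a Model column followed by one score column per subset. A minimal, self-contained sketch of that contract — toy numbers taken from the report JSONs below, not a file from the repository — using the same set_index/transpose/px.line pattern as plot_visualization:

import pandas as pd
import plotly.express as px

# Toy stand-in for a file such as dividing_into_different_subsets/3/QS/CC_QS.csv
df = pd.DataFrame({
    "Model": ["Nxcode-CQ-7B", "codegemma-7b"],
    "CC_subset_1": [88.65, 43.40],
    "CC_subset_2": [75.00, 12.50],
    "CC_subset_3": [87.50, 18.75],
})
df.set_index("Model", inplace=True)
df_t = df.T  # subsets become rows, models become columns

fig = px.line(df_t, x=df_t.index, y=df_t.columns,
              labels={"value": "Evaluation Score", "index": "Subsets"})
fig.show()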
app_back.py
ADDED

@@ -0,0 +1,363 @@ — a backup of the previous app.py implementation (Chinese comments translated to English; the condition slips in on_confirm are corrected and noted inline):

import gradio as gr
import pandas as pd
import requests
import os
import shutil
import json
import subprocess
import plotly.express as px

def on_confirm(dataset_radio, num_parts_dropdown, token_counts_radio, line_counts_radio, cyclomatic_complexity_radio, problem_type_checkbox):
    # Build the file paths from the user-selected parameters
    num_parts = num_parts_dropdown
    # token_counts_split = token_counts_radio
    # line_counts_split = line_counts_radio
    # cyclomatic_complexity_split = cyclomatic_complexity_radio

    # Read the data
    dataframes = []
    if dataset_radio == "HumanEval":
        if token_counts_radio == "Equal Frequency Partitioning":  # equal-frequency split: each subset holds roughly the same number of data points
            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/token_counts_QS.csv")
            dataframes.append(token_counts_df)
        if token_counts_radio == "Equal Interval Partitioning":
            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/token_counts_EI.csv")
            dataframes.append(token_counts_df)
        if line_counts_radio == "Equal Frequency Partitioning":
            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/line_counts_QS.csv")
            dataframes.append(line_counts_df)
        if line_counts_radio == "Equal Interval Partitioning":  # fixed: the original tested token_counts_radio here
            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/line_counts_EI.csv")
            dataframes.append(line_counts_df)
        if cyclomatic_complexity_radio == "Equal Frequency Partitioning":
            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/QS/CC_QS.csv")
            dataframes.append(CC_df)
        if cyclomatic_complexity_radio == "Equal Interval Partitioning":  # fixed: same slip as above
            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets/{num_parts}/EI/CC_EI.csv")
            dataframes.append(CC_df)

        # Problem types are read directly from a single partition file
        if problem_type_checkbox:
            problem_type_df = pd.read_csv("/home/user/app/dividing_into_different_subsets/cata_result.csv")
            dataframes.append(problem_type_df)
    if dataset_radio == "MBPP":
        if token_counts_radio == "Equal Frequency Partitioning":
            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/token_counts_QS.csv")
            dataframes.append(token_counts_df)
        if token_counts_radio == "Equal Interval Partitioning":
            token_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/token_counts_EI.csv")
            dataframes.append(token_counts_df)
        if line_counts_radio == "Equal Frequency Partitioning":
            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/line_counts_QS.csv")
            dataframes.append(line_counts_df)
        if line_counts_radio == "Equal Interval Partitioning":  # fixed: see note above
            line_counts_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/line_counts_EI.csv")
            dataframes.append(line_counts_df)
        if cyclomatic_complexity_radio == "Equal Frequency Partitioning":
            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/QS/CC_QS.csv")
            dataframes.append(CC_df)
        if cyclomatic_complexity_radio == "Equal Interval Partitioning":  # fixed: see note above
            CC_df = pd.read_csv(f"/home/user/app/dividing_into_different_subsets_mbpp/{num_parts}/EI/CC_EI.csv")
            dataframes.append(CC_df)

        # Problem types are read directly from a single partition file
        if problem_type_checkbox:
            problem_type_df = pd.read_csv("/home/user/app/dividing_into_different_subsets_mbpp/cata_result.csv")
            dataframes.append(problem_type_df)

    # If any perspective was selected, join the rows of all loaded files
    if len(dataframes) > 0:
        combined_df = dataframes[0]
        for df in dataframes[1:]:
            combined_df = pd.merge(combined_df, df, left_index=True, right_index=True, suffixes=('', '_y'))
        combined_df = combined_df.loc[:, ~combined_df.columns.str.endswith('_y')]  # drop the duplicated columns
        return combined_df
    else:
        return pd.DataFrame()


def execute_specified_python_files(directory_list, file_list):
    for directory in directory_list:
        for py_file in file_list:
            file_path = os.path.join(directory, py_file)
            if os.path.isfile(file_path) and py_file.endswith('.py'):
                print(f"Executing {file_path}...")
                try:
                    # Run the Python file via subprocess
                    subprocess.run(['python', file_path], check=True)
                    print(f"{file_path} executed successfully.")
                except subprocess.CalledProcessError as e:
                    print(f"Error executing {file_path}: {e}")
            else:
                print(f"File {file_path} does not exist or is not a Python file.")

# Generate the CSS styles
def generate_css(line_counts, token_counts, cyclomatic_complexity, problem_type, show_high, show_medium, show_low):
    css = """
    #dataframe th {
        background-color: #f2f2f2
    }
    """
    colors = ["#e6f7ff", "#ffeecc", "#e6ffe6", "#ffe6e6"]
    categories = [line_counts, token_counts, cyclomatic_complexity]
    category_index = 0
    column_index = 1

    for category in categories:
        if category:
            if show_high:
                css += f"#dataframe td:nth-child({column_index + 1}) {{ background-color: {colors[category_index]}; }}\n"
                column_index += 1
            if show_medium:
                css += f"#dataframe td:nth-child({column_index + 1}) {{ background-color: {colors[category_index]}; }}\n"
                column_index += 1
            if show_low:
                css += f"#dataframe td:nth-child({column_index + 1}) {{ background-color: {colors[category_index]}; }}\n"
                column_index += 1
            category_index += 1

    # Fixed color for the three Problem Type sub-columns
    if problem_type:
        problem_type_color = "#d4f0fc"  # any color works here
        css += f"#dataframe td:nth-child({column_index + 1}) {{ background-color: {problem_type_color}; }}\n"
        css += f"#dataframe td:nth-child({column_index + 2}) {{ background-color: {problem_type_color}; }}\n"
        css += f"#dataframe td:nth-child({column_index + 3}) {{ background-color: {problem_type_color}; }}\n"

    # Hide the "data" label
    css += """
    .gradio-container .dataframe-container::before {
        content: none !important;
    }
    """

    return css


def update_radio_options(token_counts, line_counts, cyclomatic_complexity, problem_type):
    options = []
    if token_counts:
        options.append("The Number of Tokens in Problem Descriptions")
    if line_counts:
        options.append("The Number of Lines in Problem Descriptions")
    if cyclomatic_complexity:
        options.append("The Complexity of Reference Code")
    if problem_type:
        options.append("Problem Type")

    return gr.update(choices=options)

def plot_csv(dataset_radio, radio, num):
    print(dataset_radio, radio)
    if dataset_radio == "HumanEval":
        if radio == "The Number of Tokens in Problem Descriptions":
            radio_choice = "token_counts"
            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
        elif radio == "The Number of Lines in Problem Descriptions":
            radio_choice = "line_counts"
            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
        elif radio == "The Complexity of Reference Code":
            radio_choice = "CC"
            file_path = f'/home/user/app/dividing_into_different_subsets/{num}/QS/{radio_choice}_QS.csv'
        elif radio == "Problem Type":
            radio_choice = "problem_type"
            file_path = f'/home/user/app/dividing_into_different_subsets/cata_result.csv'
            print("test!")
    elif dataset_radio == "MBPP":
        if radio == "The Number of Tokens in Problem Descriptions":
            radio_choice = "token_counts"
            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
        elif radio == "The Number of Lines in Problem Descriptions":
            radio_choice = "line_counts"
            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
        elif radio == "The Complexity of Reference Code":
            radio_choice = "CC"
            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/{num}/QS/{radio_choice}_QS.csv'
        elif radio == "Problem Type":
            radio_choice = "problem_type"
            file_path = f'/home/user/app/dividing_into_different_subsets_mbpp/cata_result.csv'
            print("test!")

    # file_path="E:/python-testn/pythonProject3/hh_1/dividing_into_different_subsets/3/QS/CC_QS.csv"
    df = pd.read_csv(file_path)
    # Use the first column as the index
    df.set_index('Model', inplace=True)

    # Transpose so that models become columns and subsets become rows
    df_transposed = df.T

    # Draw a line chart with plotly
    fig = px.line(df_transposed, x=df_transposed.index, y=df_transposed.columns,
                  title='Model Evaluation Results',
                  labels={'value': 'Evaluation Score', 'index': 'Evaluation Metric'},
                  color_discrete_sequence=px.colors.qualitative.Plotly)

    # Configure the hover effect
    fig.update_traces(hovertemplate='%{y}')

    return fig

def toggle_radio(checkbox, radio):
    return gr.update(visible=checkbox)

def toggle_line_counts_visibility(dataset):
    if dataset == "MBPP":
        return gr.update(visible=False)
    else:
        return gr.update(visible=True)

# Create the Gradio interface
with gr.Blocks() as iface:
    gr.HTML("""
    <style>
    # body {
    #     max-width: 50%; /* cap the maximum width at 50% */
    #     margin: 0 auto; /* center the content */
    # }
    .title {
        text-align: center;
        font-size: 3em;
        font-weight: bold;
        margin-bottom: 0.5em;
    }
    .subtitle {
        text-align: center;
        font-size: 2em;
        margin-bottom: 1em;
    }
    </style>
    """)

    with gr.Tabs() as tabs:
        with gr.TabItem("Evaluation Result"):
            with gr.Row():
                with gr.Column(scale=2):
                    with gr.Row():
                        with gr.Column():
                            dataset_radio = gr.Radio(["HumanEval", "MBPP"], label="Select Dataset ")

                    with gr.Row():
                        custom_css = """
                        <style>
                        .markdown-class {
                            font-family: 'Helvetica', sans-serif;
                            font-size: 20px;
                            font-weight: bold;
                            color: #333;
                        }
                        </style>
                        """

                        with gr.Column():
                            gr.Markdown(
                                f"{custom_css}<div class='markdown-class'> Choose Division Perspective </div>")

                            token_counts_checkbox = gr.Checkbox(label="I-The Number of Tokens in Problem Descriptions")
                            line_counts_checkbox = gr.Checkbox(label="II-The Number of Lines in Problem Descriptions")
                            dataset_radio.change(fn=toggle_line_counts_visibility, inputs=dataset_radio,
                                                 outputs=line_counts_checkbox)
                            cyclomatic_complexity_checkbox = gr.Checkbox(label="III-The Complexity of Reference Code")
                            problem_type_checkbox = gr.Checkbox(label="IV-Problem Types ")
                            css_code = """
                            .dropdown-container {
                                display: none;
                            }
                            """

                        with gr.Column():
                            # gr.Markdown("<div class='markdown-class'>Choose Subsets </div>")
                            num_parts_dropdown = gr.Dropdown(choices=[0, 3, 4, 5, 6, 7, 8], label="Choose the Number of Subsets", value="")

                    with gr.Row():
                        with gr.Column():
                            token_counts_radio = gr.Radio(
                                ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
                                label="Choose the Division Method for Perspective-I",
                                visible=False)
                        with gr.Column():
                            line_counts_radio = gr.Radio(
                                ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
                                label="Choose the Division Method for Perspective-II",
                                visible=False)
                        with gr.Column():
                            cyclomatic_complexity_radio = gr.Radio(
                                ["Equal Frequency Partitioning", "Equal Interval Partitioning"],
                                label="Choose the Division Method for Perspective-III",
                                visible=False)

                    token_counts_checkbox.change(fn=lambda x: toggle_radio(x, token_counts_radio),
                                                 inputs=token_counts_checkbox, outputs=token_counts_radio)
                    line_counts_checkbox.change(fn=lambda x: toggle_radio(x, line_counts_radio),
                                                inputs=line_counts_checkbox, outputs=line_counts_radio)
                    cyclomatic_complexity_checkbox.change(fn=lambda x: toggle_radio(x, cyclomatic_complexity_radio),
                                                          inputs=cyclomatic_complexity_checkbox,
                                                          outputs=cyclomatic_complexity_radio)

                    with gr.Tabs() as inner_tabs:
                        with gr.TabItem("Ranking Table"):
                            dataframe_output = gr.Dataframe(elem_id="dataframe")
                            css_output = gr.HTML()
                            confirm_button = gr.Button("Confirm ")
                            confirm_button.click(fn=on_confirm,
                                                 inputs=[dataset_radio, num_parts_dropdown, token_counts_radio,
                                                         line_counts_radio, cyclomatic_complexity_radio,
                                                         problem_type_checkbox],
                                                 outputs=dataframe_output)

                        with gr.TabItem("Line chart"):
                            select_radio = gr.Radio(choices=[], label="Select One Perspective")
                            checkboxes = [token_counts_checkbox, line_counts_checkbox, cyclomatic_complexity_checkbox,
                                          problem_type_checkbox]
                            for checkbox in checkboxes:
                                checkbox.change(fn=update_radio_options, inputs=checkboxes, outputs=select_radio)
                            select_radio.change(fn=plot_csv, inputs=[dataset_radio, select_radio, num_parts_dropdown],
                                                outputs=gr.Plot(label="Line Plot "))

                        # with gr.TabItem("Upload Inference File"):
                        #     gr.Markdown("Upload a JSON file")
                        #     with gr.Row():
                        #         with gr.Column():
                        #             string_input = gr.Textbox(label="Enter the Model Name")
                        #             number_input = gr.Number(label="Select the Number of Samples")
                        #             dataset_choice = gr.Dropdown(label="Select Dataset", choices=["HumanEval", "MBPP"])
                        #         with gr.Column():
                        #             file_input = gr.File(label="Upload Generation Result in JSON file")
                        #             upload_button = gr.Button("Confirm and Upload")

                        #     json_output = gr.JSON(label="")

                        #     upload_button.click(fn=generate_file, inputs=[file_input, string_input, number_input, dataset_choice],
                        #                         outputs=json_output)

    css = """
    #scale1 {
        border: 1px solid rgba(0, 0, 0, 0.2);
        padding: 10px;
        border-radius: 8px;
        background-color: #f9f9f9;
        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
    }
    """
    gr.HTML(f"<style>{css}</style>")

    # Initialize the data table
    # initial_df = show_data(False, False, False, False, False, False, False)
    # initial_css = generate_css(False, False, False, False, True, False, False)
    # dataframe_output.value = initial_df
    # css_output.value = f"<style>{initial_css}</style>"

# Launch the interface
iface.launch()
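The old on_confirm combines one DataFrame per selected perspective by merging on the row index with suffixes=('', '_y') and then dropping every column that picked up the _y suffix. A small standalone illustration of that merge-and-dedup pattern (toy frames with hypothetical values, not repository data):

import pandas as pd

a = pd.DataFrame({"Model": ["m1", "m2"], "token_subset_1": [70.0, 55.0]})
b = pd.DataFrame({"Model": ["m1", "m2"], "line_subset_1": [68.0, 52.0]})

# Index-aligned merge; the colliding "Model" column from b becomes "Model_y"
combined = pd.merge(a, b, left_index=True, right_index=True, suffixes=("", "_y"))
combined = combined.loc[:, ~combined.columns.str.endswith("_y")]  # drop the duplicate
print(combined)  # columns: Model, token_subset_1, line_subset_1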
llm_insight/HumanEval/3/EI/CC_recommendation.json
ADDED

@@ -0,0 +1,26 @@
{
    "High performance and robustness": [
        {
            "Nxcode-CQ-7B": "Consistently high scores across all subsets, making it the most reliable choice."
        },
        {
            "CodeFuse-DeepSeek-33b": "Strong performance in two subsets, though with a drop in one, indicating good but not perfect robustness."
        }
    ],
    "Moderate performance with cost-effectiveness": [
        {
            "deepseek_coder_33b-instruct": "Decent performance across all subsets, offering a balance between cost and effectiveness."
        },
        {
            "deepseek_coder-6.7b-instruct": "Good performance in two subsets, suitable for scenarios where some variability is acceptable."
        }
    ],
    "Not recommended": [
        {
            "codegemma-2b": "Extremely low performance across all subsets."
        },
        {
            "deepseek-coder-1.3b-base": "Poor performance, especially in one subset with 0.0 score."
        }
    ]
}
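Each *_recommendation.json maps a scenario name to a list of single-entry {model: reason} objects; plot_recommendation_sunburst in app.py flattens exactly this shape into the parallel labels/parents lists a sunburst needs. A compact sketch of that flattening, run against the file above:

import json

with open("llm_insight/HumanEval/3/EI/CC_recommendation.json", encoding="utf-8") as f:
    rec = json.load(f)

labels, parents = ["Model Recommendation"], [""]
for scenario, model_list in rec.items():
    labels.append(scenario)
    parents.append("Model Recommendation")
    for entry in model_list:            # each entry is a one-key {model: reason} dict
        for model, reason in entry.items():
            labels.append(model)
            parents.append(scenario)

print(len(labels), "sunburst nodes")    # 1 root + 3 scenarios + 6 models = 10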
llm_insight/HumanEval/3/EI/CC_report.json
ADDED

@@ -0,0 +1,15 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows strong performance in CC_subset_1 (79.86) and CC_subset_3 (75.0), but a significant drop in CC_subset_2 (50.0). This suggests it may struggle with certain types of data in the CC perspective.",
    "Nxcode-CQ-7B": "Consistently high performance across all subsets (88.65, 75.0, 87.5), indicating robustness in handling diverse data splits under the CC perspective.",
    "codegemma-2b": "Poor performance across all subsets (30.56, 2.5, 2.5), suggesting it is not suitable for tasks under the CC perspective.",
    "codegemma-7b": "Moderate performance in CC_subset_1 (43.4) but very low in CC_subset_2 (12.5) and CC_subset_3 (18.75), indicating limited applicability.",
    "codegemma-7b-it": "Better than codegemma-7b but still inconsistent (56.32, 32.5, 21.25), with performance dropping in later subsets.",
    "deepseek-coder-1.3b-base": "Very low performance, especially in CC_subset_3 (0.0), making it unsuitable for this perspective.",
    "deepseek-coder-6.7b-base": "Moderate in CC_subset_1 (50.31) but poor in others (14.06, 5.0), indicating limited use.",
    "deepseek_coder-6.7b-instruct": "Strong in CC_subset_1 (74.62) and decent in CC_subset_2 (53.12), but drops in CC_subset_3 (36.25).",
    "deepseek_coder_33b-base": "Moderate performance across subsets (55.9, 31.87, 22.5), with a downward trend.",
    "deepseek_coder_33b-instruct": "Consistently decent performance (68.65, 49.69, 41.25), though not as high as Nxcode-CQ-7B.",
    "codeqwen1.5-7b": "Inconsistent performance (55.0, 17.19, 57.5), with a notable drop in CC_subset_2.",
    "new": "Similar to codegemma-7b-it (56.32, 32.5, 21.25), indicating no significant improvement.",
    "global_insights": "Nxcode-CQ-7B is the top performer across all subsets, showing robustness. CodeFuse-DeepSeek-33b and deepseek_coder_33b-instruct also perform well but with some variability. Smaller models like codegemma-2b and deepseek-coder-1.3b-base are not recommended for this perspective."
}
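The *_report.json files key each analysis by model name, plus one global_insights entry; on_confirm attaches these to the ranking table via df["Model"].map(...), falling back to a default string for models without an entry. A minimal illustration (toy model list; the second name is hypothetical):

import json
import pandas as pd

with open("llm_insight/HumanEval/3/EI/CC_report.json", encoding="utf-8") as f:
    report = json.load(f)

df = pd.DataFrame({"Model": ["Nxcode-CQ-7B", "some-unknown-model"]})
# Models missing from the report get the same fallback text app.py uses
df["Analysis"] = df["Model"].map(lambda m: report.get(m, "No analysis provided."))
print(df)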
llm_insight/HumanEval/3/EI/line_counts_recommendation.json
ADDED

@@ -0,0 +1,23 @@
{
    "Small to medium line counts with high accuracy": [
        {
            "Nxcode-CQ-7B": "Excels in line_subset_1 (88.82) and line_subset_2 (87.27), making it ideal for small to medium line counts."
        },
        {
            "deepseek_coder-6.7b-instruct": "Strong performance in line_subset_1 (75.65) and line_subset_2 (67.42), suitable for similar scenarios."
        }
    ],
    "Large line counts with acceptable performance": [
        {
            "CodeFuse-DeepSeek-33b": "Best performance in line_subset_3 (87.5), making it the top choice for large line counts."
        }
    ],
    "Cost-effective for small line counts": [
        {
            "codegemma-7b-it": "Moderate performance in line_subset_1 (57.32) at a potentially lower cost than larger models."
        },
        {
            "deepseek_coder_33b-instruct": "Balanced performance in line_subset_1 (70.85) and line_subset_2 (60.0) for its size."
        }
    ]
}
llm_insight/HumanEval/3/EI/line_counts_report.json
ADDED

@@ -0,0 +1,15 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows strong performance in line_subset_3 (87.5) but drops significantly in line_subset_2 (72.73). This suggests it handles larger line counts better but may struggle with medium-sized inputs.",
    "Nxcode-CQ-7B": "This model excels in line_subset_1 (88.82) and line_subset_2 (87.27) but performs poorly in line_subset_3 (63.75), indicating it's optimized for small to medium line counts but not for larger ones.",
    "codegemma-2b": "Consistently poor performance across all subsets, with the worst in line_subset_3 (1.88). Not suitable for any line count scenario.",
    "codegemma-7b": "Better than codegemma-2b but still underperforms, especially in line_subset_3 (11.88). Limited utility across line counts.",
    "codegemma-7b-it": "Shows improvement over other codegemma models, particularly in line_subset_1 (57.32), but still struggles with larger line counts (26.88 in line_subset_3).",
    "deepseek-coder-1.3b-base": "Low performance across all subsets, with line_subset_1 being the best (35.69). Not recommended for any scenario.",
    "deepseek-coder-6.7b-base": "Moderate performance in line_subset_1 (49.47) and line_subset_2 (40.61), but very poor in line_subset_3 (8.13). Avoid for large line counts.",
    "deepseek_coder-6.7b-instruct": "Strong in line_subset_1 (75.65) and line_subset_2 (67.42), but drops in line_subset_3 (26.25). Suitable for small to medium line counts.",
    "deepseek_coder_33b-base": "Decent in line_subset_1 (57.4) and line_subset_2 (44.55), but poor in line_subset_3 (15.0). Limited to small-medium line counts.",
    "deepseek_coder_33b-instruct": "Good performance in line_subset_1 (70.85) and line_subset_2 (60.0), but struggles with line_subset_3 (18.75). Best for small-medium line counts.",
    "codeqwen1.5-7b": "Moderate across all subsets, with line_subset_1 (53.7) being the best. Not outstanding in any scenario.",
    "new": "Similar to codegemma-7b-it, with identical scores. Shows potential for small-medium line counts but not for large ones.",
    "global_insights": "Models generally perform best in line_subset_1 and worst in line_subset_3, indicating a trend of decreasing performance with increasing line counts. Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are exceptions, excelling in different subsets. Smaller models (e.g., codegemma-2b) are consistently poor, while larger models (e.g., deepseek_coder-6.7b-instruct) show better adaptability."
}
llm_insight/HumanEval/3/EI/token_counts_recommendation.json
ADDED

@@ -0,0 +1,26 @@
{
    "High performance with cost-effectiveness": [
        {
            "Nxcode-CQ-7B": "Consistently high performance in shorter and medium token subsets, making it suitable for most tasks."
        },
        {
            "CodeFuse-DeepSeek-33b": "Strong performance in shorter and longer token subsets, ideal for tasks with varying token lengths."
        }
    ],
    "Moderate performance with lower cost": [
        {
            "deepseek_coder-6.7b-instruct": "Good performance in shorter and medium token subsets, suitable for budget-conscious users."
        },
        {
            "codegemma-7b-it": "Better performance than base models, offering a balance between cost and capability."
        }
    ],
    "Not recommended for complex tasks": [
        {
            "codegemma-2b": "Poor performance across all subsets, not suitable for any serious tasks."
        },
        {
            "deepseek-coder-1.3b-base": "Low scores across all subsets, indicating limited utility."
        }
    ]
}
llm_insight/HumanEval/3/EI/token_counts_report.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_1 and token_subset_3 but a significant drop in token_subset_2, indicating potential sensitivity to medium-length tokens.",
+    "Nxcode-CQ-7B": "Consistently high performance across token_subset_1 and token_subset_2, but a notable decline in token_subset_3, suggesting challenges with longer token sequences.",
+    "codegemma-2b": "Poor performance across all subsets, with particularly low scores in token_subset_3, indicating it may not be suitable for tasks involving varying token lengths.",
+    "codegemma-7b": "Moderate performance with a steady decline as token length increases, suggesting limited scalability with longer inputs.",
+    "codegemma-7b-it": "Better performance than codegemma-7b but still struggles with longer tokens, indicating some improvement but not enough for complex tasks.",
+    "deepseek-coder-1.3b-base": "Low scores across all subsets, with a slight improvement in token_subset_1, indicating limited capability.",
+    "deepseek-coder-6.7b-base": "Moderate performance in token_subset_1 but significant drops in longer tokens, suggesting scalability issues.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in token_subset_1 and token_subset_2 but a sharp decline in token_subset_3, indicating challenges with very long tokens.",
+    "deepseek_coder_33b-base": "Moderate performance with a steady decline as token length increases, similar to codegemma-7b.",
+    "deepseek_coder_33b-instruct": "Good performance in token_subset_1 and token_subset_3 but a drop in token_subset_2, indicating inconsistency with medium-length tokens.",
+    "codeqwen1.5-7b": "Moderate performance across all subsets, with a steady decline as token length increases, suggesting limited scalability.",
+    "new": "Identical performance to codegemma-7b-it, indicating no additional improvements.",
+    "global_insights": "Models generally perform best on shorter token sequences (token_subset_1) and struggle with longer ones (token_subset_3). Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are the top performers but show inconsistency across subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base are not recommended for tasks involving varying token lengths."
+}
llm_insight/HumanEval/3/QS/CC_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High accuracy and robustness": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across all subsets, making it ideal for scenarios requiring reliable and stable outputs."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Good balance of performance and adaptability, suitable for diverse data conditions."
+        }
+    ],
+    "Moderate accuracy with cost-effectiveness": [
+        {
+            "deepseek_coder_33b-instruct": "Offers decent performance at a potentially lower cost compared to larger models like Nxcode-CQ-7B."
+        },
+        {
+            "codeqwen1.5-7b": "Provides moderate performance at a lower computational cost, suitable for budget-constrained scenarios."
+        }
+    ],
+    "Low-cost with acceptable performance": [
+        {
+            "codegemma-7b-it": "While not the best performer, it offers a cost-effective solution for less critical applications."
+        },
+        {
+            "deepseek-coder-6.7b-base": "A budget-friendly option with moderate performance, suitable for non-critical tasks."
+        }
+    ]
+}
llm_insight/HumanEval/3/QS/CC_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows a consistent decline in performance across subsets, indicating potential sensitivity to dataset variations. It performs best in CC_subset_1 but drops significantly in CC_subset_3.",
+    "Nxcode-CQ-7B": "This model demonstrates robust performance across all subsets, with minimal variation. It maintains high accuracy, suggesting strong generalization capabilities.",
+    "codegemma-2b": "Performance degrades sharply across subsets, indicating poor adaptability to varying data conditions. The model struggles significantly in CC_subset_3.",
+    "codegemma-7b": "Similar to codegemma-2b, this model shows a steady decline in performance, though it starts from a higher baseline. It may not be suitable for diverse data conditions.",
+    "codegemma-7b-it": "While performance declines across subsets, the drop is less severe compared to other codegemma variants. It maintains moderate accuracy in CC_subset_3.",
+    "deepseek-coder-1.3b-base": "This model exhibits a steep performance drop, particularly in CC_subset_3, suggesting limited robustness to dataset changes.",
+    "deepseek-coder-6.7b-base": "Performance declines steadily but remains above smaller models like codegemma-2b. It shows moderate adaptability.",
+    "deepseek_coder-6.7b-instruct": "This model maintains relatively high performance across subsets, with a slight decline in CC_subset_3. It shows good generalization.",
+    "deepseek_coder_33b-base": "Performance drops steadily but remains above smaller models. It shows moderate robustness to dataset variations.",
+    "deepseek_coder_33b-instruct": "The model maintains decent performance across subsets, with a gradual decline. It shows better adaptability than its base counterpart.",
+    "codeqwen1.5-7b": "Performance declines across subsets but remains above smaller models. It shows moderate adaptability to varying data conditions.",
+    "global_insights": "Larger models generally perform better across subsets, with Nxcode-CQ-7B being the most consistent. Performance degradation is observed in all models as dataset complexity increases (from CC_subset_1 to CC_subset_3). Instruct-tuned models tend to outperform their base counterparts, suggesting the value of instruction fine-tuning."
+}
llm_insight/HumanEval/3/QS/line_counts_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High accuracy and consistency": [
+        {
+            "Nxcode-CQ-7B": "Consistently outperforms all other models across all subsets, making it the best choice for high accuracy tasks."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Strong performance and robustness, though slightly less consistent than Nxcode-CQ-7B."
+        }
+    ],
+    "Moderate accuracy with cost-effectiveness": [
+        {
+            "deepseek_coder-6.7b-instruct": "Good performance at a potentially lower cost compared to larger models."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Better performance than its base version, suitable for tasks needing moderate accuracy."
+        }
+    ],
+    "Low-cost for simpler tasks": [
+        {
+            "codegemma-7b-it": "Decent performance for simpler tasks, though not suitable for high accuracy needs."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance at a lower cost, suitable for less critical tasks."
+        }
+    ]
+}
llm_insight/HumanEval/3/QS/line_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance across all subsets, with the highest score in subset_1 (81.82) and a slight drop in subset_2 (72.22) and subset_3 (76.36). This indicates robustness but some variability based on line counts.",
+    "Nxcode-CQ-7B": "This model outperforms all others, especially in subset_1 (92.09) and maintains high scores in subset_2 (88.33) and subset_3 (81.45). Its consistency and high performance make it a top contender.",
+    "codegemma-2b": "The model performs poorly across all subsets, with the lowest scores (44.09, 17.5, 19.64). It is not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Performance is mediocre, with scores (52.45, 35.19, 31.64) showing a significant drop as line counts increase. Not recommended for complex tasks.",
+    "codegemma-7b-it": "Better than its non-IT counterpart, but still shows a declining trend (66.36, 49.26, 43.73). Suitable for simpler tasks but not for high accuracy needs.",
+    "deepseek-coder-1.3b-base": "Low scores (47.45, 26.39, 23.0) indicate this model is not suitable for tasks requiring precision or handling larger line counts.",
+    "deepseek-coder-6.7b-base": "Moderate performance (63.36, 39.35, 34.18) but declines with larger subsets. Limited utility for complex tasks.",
+    "deepseek_coder-6.7b-instruct": "Strong performance (85.0, 66.85, 62.82) but not as consistent as Nxcode-CQ-7B. Good for tasks needing moderate accuracy.",
+    "deepseek_coder_33b-base": "Decent performance (68.0, 48.89, 41.27) but declines with larger subsets. Not the best choice for high accuracy.",
+    "deepseek_coder_33b-instruct": "Good performance (82.09, 62.31, 53.91) but still not as consistent as Nxcode-CQ-7B. Suitable for tasks needing moderate accuracy.",
+    "codeqwen1.5-7b": "Moderate scores (59.73, 48.7, 45.64) with a declining trend. Limited utility for high accuracy tasks.",
+    "global_insights": "Nxcode-CQ-7B is the top performer across all subsets, showing both high accuracy and consistency. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly, especially with larger line counts. Instruct versions of models generally perform better than their base counterparts. Line count significantly impacts performance for most models, with larger subsets showing lower scores."
+}
llm_insight/HumanEval/3/QS/token_counts_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High performance and robustness": [
+        {
+            "Nxcode-CQ-7B": "Consistently high scores across all token subsets, making it reliable for varied token counts."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Strong performance in all subsets, suitable for tasks requiring stability across token counts."
+        }
+    ],
+    "Balanced performance and cost-effectiveness": [
+        {
+            "deepseek_coder_33b-instruct": "Good performance with a reasonable drop in higher token counts, offering a balance between cost and capability."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance with a stable decline, suitable for tasks where cost is a concern but performance cannot be compromised too much."
+        }
+    ],
+    "Low-cost for low token count tasks": [
+        {
+            "codegemma-7b-it": "Performs well in low to medium token counts, offering a cost-effective solution for smaller tasks."
+        },
+        {
+            "deepseek-coder-6.7b-base": "Better than smaller models and cost-effective for tasks with moderate token counts."
+        }
+    ]
+}
llm_insight/HumanEval/3/QS/token_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_2 (88.89) but drops significantly in token_subset_3 (69.09), indicating potential sensitivity to token count variations.",
+    "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the highest score in token_subset_1 (90.73). This suggests robustness to token count changes.",
+    "codegemma-2b": "Performance declines sharply as token count increases, with the lowest score in token_subset_3 (11.82). Not suitable for high token count tasks.",
+    "codegemma-7b": "Similar to codegemma-2b but with better overall performance, though still struggles with higher token counts.",
+    "codegemma-7b-it": "Shows a steady decline with increasing token counts but maintains relatively better performance than other codegemma variants.",
+    "deepseek-coder-1.3b-base": "Performance drops significantly with higher token counts, similar to codegemma models.",
+    "deepseek-coder-6.7b-base": "Better than 1.3b-base but still shows a notable decline in token_subset_3 (25.09).",
+    "deepseek_coder-6.7b-instruct": "Strong performance across all subsets, with the highest score in token_subset_1 (83.64). Robust to token count variations.",
+    "deepseek_coder_33b-base": "Moderate performance with a steady decline in higher token counts.",
+    "deepseek_coder_33b-instruct": "Good performance overall but shows a noticeable drop in token_subset_3 (50.36).",
+    "codeqwen1.5-7b": "Performance declines with higher token counts but remains relatively stable compared to other models.",
+    "global_insights": "Models like Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show robustness across token count variations, while smaller models (e.g., codegemma-2b, deepseek-coder-1.3b-base) struggle with higher token counts. Larger models generally perform better but may still show declines in the highest token subset."
+}
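
Each partition directory added in this commit pairs a <feature>_report.json (per-model commentary plus a "global_insights" summary string) with a <feature>_recommendation.json (scenario tiers, each a list of {model: rationale} objects). Below is a minimal sketch of how a viewer might load one such pair; the helper name and default arguments are illustrative assumptions, not code taken from app.py:

import json
import os

# Assumed layout, mirroring the files added in this commit:
#   llm_insight/<dataset>/<num_parts>/<split>/<feature>_report.json
#   llm_insight/<dataset>/<num_parts>/<split>/<feature>_recommendation.json
def load_insight(dataset="HumanEval", num_parts=3, split="EI", feature="token_counts"):
    folder = os.path.join("llm_insight", dataset, str(num_parts), split)
    with open(os.path.join(folder, feature + "_report.json"), encoding="utf-8") as f:
        report = json.load(f)
    with open(os.path.join(folder, feature + "_recommendation.json"), encoding="utf-8") as f:
        recommendation = json.load(f)
    return report, recommendation

report, recommendation = load_insight()
print(report["global_insights"])              # one summary string
for tier, entries in recommendation.items():  # tier -> [{model: rationale}, ...]
    print(tier, "->", [name for entry in entries for name in entry])
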
llm_insight/HumanEval/4/EI/CC_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High performance with no cost constraints": [
+        {
+            "Nxcode-CQ-7B": "Consistently top performer across most subsets, making it the best choice for high-stakes CC tasks."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Strong performance in multiple subsets, though slightly less consistent than Nxcode-CQ-7B."
+        }
+    ],
+    "Balanced performance and cost": [
+        {
+            "deepseek_coder_33b-instruct": "Reasonable performance across subsets, offering a good balance between cost and capability."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance in some subsets, potentially more cost-effective than larger models."
+        }
+    ],
+    "Low cost with acceptable performance": [
+        {
+            "codegemma-7b-it": "Better than other small models, though still inconsistent. Suitable for less critical tasks."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Better among smaller deepseek models, but still has significant performance drops."
+        }
+    ]
+}
llm_insight/HumanEval/4/EI/CC_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in CC_subset_1 and CC_subset_4 but drops significantly in CC_subset_2 and CC_subset_3. This suggests it may struggle with certain types of CC-related tasks.",
+    "Nxcode-CQ-7B": "This model performs exceptionally well in CC_subset_1, CC_subset_2, and CC_subset_4, but has a notable drop in CC_subset_3. It's consistently one of the top performers across most subsets.",
+    "codegemma-2b": "The model performs poorly across all subsets, particularly in CC_subset_3 and CC_subset_4 where it scores 0.0 and 2.5 respectively. It's not suitable for CC-related tasks.",
+    "codegemma-7b": "While better than its 2b counterpart, this model still struggles, especially in CC_subset_3 where it scores 0.0. It shows some improvement in other subsets but remains weak overall.",
+    "codegemma-7b-it": "This model shows moderate performance in CC_subset_1 and CC_subset_2 but drops in CC_subset_3 and CC_subset_4. It's better than other codegemma versions but still inconsistent.",
+    "deepseek-coder-1.3b-base": "The model performs poorly across all subsets, particularly in CC_subset_3 and CC_subset_4 where it scores 0.0. It's not recommended for CC tasks.",
+    "deepseek-coder-6.7b-base": "This model shows some capability in CC_subset_1 and CC_subset_2 but fails completely in CC_subset_3 and CC_subset_4. It's inconsistent and not reliable.",
+    "deepseek_coder-6.7b-instruct": "The model performs well in CC_subset_1 and CC_subset_2 but drops significantly in CC_subset_3 and CC_subset_4. It's a better option among deepseek models but still inconsistent.",
+    "deepseek_coder_33b-base": "This model shows moderate performance in CC_subset_1 and CC_subset_2 but struggles in CC_subset_3 and CC_subset_4. It's better than smaller deepseek models but not top-tier.",
+    "deepseek_coder_33b-instruct": "The model performs reasonably well across all subsets, though it drops in CC_subset_3 and CC_subset_4. It's one of the better deepseek models but still not the best.",
+    "codeqwen1.5-7b": "This model shows moderate performance in CC_subset_1 and CC_subset_2, fails in CC_subset_3, but recovers somewhat in CC_subset_4. It's inconsistent but has some strengths.",
+    "global_insights": "Nxcode-CQ-7B is the top performer overall, excelling in most subsets. CodeFuse-DeepSeek-33b also shows strong performance in some subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly across the board. The CC_subset_3 appears to be the most challenging for all models, with many scoring 0.0. Models generally perform better in CC_subset_1 and CC_subset_2, suggesting these subsets may contain less complex tasks."
+}
llm_insight/HumanEval/4/EI/line_counts_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High accuracy for small to medium tasks": [
+        {
+            "Nxcode-CQ-7B": "Best performance in subsets 1 and 2, suitable for tasks with smaller line counts."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Consistent performance across all subsets, reliable for varied tasks."
+        }
+    ],
+    "Moderate accuracy with cost-effectiveness": [
+        {
+            "deepseek_coder-6.7b-instruct": "Good balance of performance and cost for smaller tasks."
+        },
+        {
+            "codegemma-7b-it": "Cost-effective for moderate line counts with decent performance."
+        }
+    ],
+    "Not recommended": [
+        {
+            "codegemma-2b": "Poor performance across all subsets."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Inconsistent and low performance."
+        }
+    ]
+}
llm_insight/HumanEval/4/EI/line_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "Consistent performance across subsets, with a slight dip in line_subset_2. Demonstrates robustness in handling varying line counts.",
+    "Nxcode-CQ-7B": "Highest performance in line_subset_1 and line_subset_2, but significant drop in line_subset_3. May struggle with larger line counts.",
+    "codegemma-2b": "Poor performance across all subsets, especially in line_subset_3 and line_subset_4. Not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Better than codegemma-2b but still underperforms in larger line counts. Limited scalability.",
+    "codegemma-7b-it": "Improved over codegemma-7b, especially in line_subset_4. Shows potential for tasks with moderate line counts.",
+    "deepseek-coder-1.3b-base": "Low performance overall, with a surprising spike in line_subset_4. Inconsistent behavior.",
+    "deepseek-coder-6.7b-base": "Moderate performance, but drops significantly in line_subset_3 and line_subset_4. Limited reliability.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in line_subset_1 and line_subset_2, but drops in larger subsets. Best for smaller tasks.",
+    "deepseek_coder_33b-base": "Decent performance but inconsistent across subsets. May require fine-tuning.",
+    "deepseek_coder_33b-instruct": "Good performance in line_subset_1 and line_subset_2, but drops sharply in line_subset_4. Unpredictable for larger tasks.",
+    "codeqwen1.5-7b": "Moderate performance across subsets, with a dip in line_subset_3. Balanced but not outstanding.",
+    "global_insights": "Models generally perform better in smaller line counts (subsets 1 and 2). Larger subsets (3 and 4) show significant performance drops, indicating scalability issues. Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are top performers but have trade-offs. Smaller models like codegemma-2b and deepseek-coder-1.3b-base are not recommended for any serious tasks."
+}
llm_insight/HumanEval/4/EI/token_counts_recommendation.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "High performance with long inputs": [
+        {
+            "CodeFuse-DeepSeek-33b": "This model is the only one that handles the longest token subset perfectly, making it ideal for tasks requiring processing of very long inputs."
+        }
+    ],
+    "Balanced performance across most inputs": [
+        {
+            "Nxcode-CQ-7B": "This model performs consistently well across most token subsets, making it a reliable choice for general tasks with varying input lengths."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "This model shows strong performance in shorter to medium token subsets, making it suitable for tasks where inputs are not extremely long."
+        }
+    ],
+    "Cost-effective for shorter inputs": [
+        {
+            "codegemma-7b-it": "This model offers reasonable performance for shorter inputs at a lower computational cost compared to larger models."
+        },
+        {
+            "codeqwen1.5-7b": "This model provides a balance between performance and cost for tasks with shorter to medium input lengths."
+        }
+    ]
+}
llm_insight/HumanEval/4/EI/token_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_4 with a perfect score of 100.0, but its performance degrades as token counts decrease, indicating potential sensitivity to input length.",
+    "Nxcode-CQ-7B": "This model performs consistently well across token_subset_1 to token_subset_3 but shows a significant drop in token_subset_4, suggesting it may struggle with very long inputs.",
+    "codegemma-2b": "The model's performance is poor across all subsets, with particularly low scores in token_subset_3 and token_subset_4, indicating it may not be suitable for tasks requiring handling of varying token lengths.",
+    "codegemma-7b": "While better than its 2b counterpart, this model still struggles with longer token subsets, showing a clear decline in performance as token counts increase.",
+    "codegemma-7b-it": "This model shows a moderate improvement over the base 7b model, particularly in token_subset_3, but still fails to handle token_subset_4.",
+    "deepseek-coder-1.3b-base": "The model's performance is weak across all subsets, with scores dropping sharply as token counts increase, making it unsuitable for tasks with longer inputs.",
+    "deepseek-coder-6.7b-base": "This model shows better performance than the 1.3b version but still struggles with longer token subsets.",
+    "deepseek_coder-6.7b-instruct": "The model performs well in token_subset_1 and token_subset_2 but shows a significant drop in token_subset_3 and fails in token_subset_4.",
+    "deepseek_coder_33b-base": "This model shows decent performance in token_subset_1 but declines as token counts increase, indicating limitations with longer inputs.",
+    "deepseek_coder_33b-instruct": "The model performs reasonably well in token_subset_1 and token_subset_2 but struggles with longer token subsets.",
+    "codeqwen1.5-7b": "This model shows moderate performance across subsets but fails to handle token_subset_4, similar to other models.",
+    "global_insights": "Most models struggle with longer token subsets, with only CodeFuse-DeepSeek-33b showing perfect performance in token_subset_4. Nxcode-CQ-7B performs consistently well across most subsets but fails in the longest subset. Smaller models generally perform worse, especially with longer inputs. The instruct versions of models tend to perform better than their base counterparts."
+}
llm_insight/HumanEval/4/QS/CC_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High performance with robust generalization": [
+        {
+            "Nxcode-CQ-7B": "Consistently high scores across all subsets, indicating strong generalization and reliability."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Despite the decline, it maintains competitive performance, suitable for tasks where initial high accuracy is critical."
+        }
+    ],
+    "Cost-effective with moderate performance": [
+        {
+            "deepseek_coder-6.7b-instruct": "Balances performance and cost, especially beneficial with instruction tuning."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Larger but still cost-effective for tasks needing stable performance."
+        }
+    ],
+    "Budget-constrained scenarios": [
+        {
+            "codegemma-7b-it": "Reasonable performance for its size, suitable when resources are limited."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance at a lower computational cost."
+        }
+    ]
+}
llm_insight/HumanEval/4/QS/CC_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows a consistent decline in performance across subsets, indicating potential overfitting or lack of generalization. The drop from 85.37 to 68.29 suggests sensitivity to data distribution shifts.",
+    "Nxcode-CQ-7B": "This model demonstrates strong performance across all subsets, with a slight dip in CC_subset_2. Its high scores in CC_subset_3 and CC_subset_4 indicate robustness to varying data conditions.",
+    "codegemma-2b": "Poor performance across all subsets, with particularly low scores in CC_subset_4. This suggests the model may lack the capacity for the task or requires significant fine-tuning.",
+    "codegemma-7b": "Better than its 2b counterpart but still underperforms compared to larger models. The performance drop in CC_subset_4 is significant, indicating limitations in handling more complex data.",
+    "codegemma-7b-it": "Improved over the base 7b model, especially in CC_subset_2 and CC_subset_4, suggesting that instruction tuning helps but may not be sufficient for top-tier performance.",
+    "deepseek-coder-1.3b-base": "Low scores across all subsets, similar to codegemma-2b, indicating that small models struggle with this task regardless of architecture.",
+    "deepseek-coder-6.7b-base": "Moderate performance with a notable drop in CC_subset_4. The model shows potential but may need further optimization or scaling.",
+    "deepseek_coder-6.7b-instruct": "Strong performance, especially in CC_subset_1 and CC_subset_3, indicating that instruction tuning significantly benefits this model size.",
+    "deepseek_coder_33b-base": "Decent performance but inconsistent, with a significant drop in CC_subset_4. The base model may need fine-tuning to stabilize performance.",
+    "deepseek_coder_33b-instruct": "Consistently good performance across subsets, though not the best. Instruction tuning helps, but the model may still lag behind the top performers.",
+    "codeqwen1.5-7b": "Moderate performance with a steady decline across subsets. The model shows promise but may need further scaling or tuning.",
+    "global_insights": "Larger models generally perform better, with Nxcode-CQ-7B and CodeFuse-DeepSeek-33b leading. Instruction tuning (e.g., deepseek_coder-6.7b-instruct) significantly boosts performance. Smaller models (codegemma-2b, deepseek-coder-1.3b-base) struggle, indicating a minimum model size is required for this task. Performance drops in later subsets suggest increasing complexity or distribution shifts that challenge generalization."
+}
llm_insight/HumanEval/4/QS/line_counts_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High performance with moderate cost": [
+        {
+            "Nxcode-CQ-7B": "Maintains high performance across all subsets, making it suitable for tasks requiring consistent accuracy."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Offers robust performance across subsets, ideal for tasks needing reliable results with varying line counts."
+        }
+    ],
+    "Balanced performance and cost": [
+        {
+            "CodeFuse-DeepSeek-33b": "Good performance in smaller subsets, suitable for tasks where line counts are generally low."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Performs well in smaller subsets and moderately in larger ones, offering a balance between cost and performance."
+        }
+    ],
+    "Low cost with acceptable performance for small tasks": [
+        {
+            "codegemma-7b-it": "Performs adequately in smaller subsets, suitable for budget-conscious projects with limited line counts."
+        },
+        {
+            "codeqwen1.5-7b": "Offers moderate performance at a lower cost, ideal for smaller-scale tasks."
+        }
+    ]
+}
llm_insight/HumanEval/4/QS/line_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows consistent performance across subsets 1 and 2 but drops significantly in subsets 3 and 4, indicating potential sensitivity to line count variations.",
+    "Nxcode-CQ-7B": "This model maintains relatively high performance across all subsets, though there is a gradual decline as line counts increase, suggesting robustness but with some sensitivity to complexity.",
+    "codegemma-2b": "Performance drops sharply from subset 1 to subset 2 and remains low, indicating poor scalability with increasing line counts.",
+    "codegemma-7b": "Similar to codegemma-2b, this model shows a significant drop in performance after subset 1, though it performs slightly better overall.",
+    "codegemma-7b-it": "This model performs better than its non-IT counterpart but still shows a decline as line counts increase, suggesting limited scalability.",
+    "deepseek-coder-1.3b-base": "Performance declines sharply after subset 1, indicating poor handling of larger code segments.",
+    "deepseek-coder-6.7b-base": "Shows a moderate drop in performance across subsets, with a significant decline in subset 3, suggesting limitations with more complex code.",
+    "deepseek_coder-6.7b-instruct": "Maintains high performance in subsets 1 and 2 but drops in subsets 3 and 4, indicating sensitivity to line count increases.",
+    "deepseek_coder_33b-base": "Performance declines steadily across subsets, suggesting a linear relationship between line count and performance drop.",
+    "deepseek_coder_33b-instruct": "Shows a gradual decline in performance across subsets, but maintains relatively higher scores compared to other models.",
+    "codeqwen1.5-7b": "Performance drops gradually across subsets, indicating moderate sensitivity to line count increases.",
+    "global_insights": "Models generally perform better on smaller line counts (subsets 1 and 2) and decline as line counts increase (subsets 3 and 4). Nxcode-CQ-7B and deepseek_coder_33b-instruct show the most robustness across all subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle significantly with larger line counts."
+}
llm_insight/HumanEval/4/QS/token_counts_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High performance and robustness across all token counts": [
+        {
+            "Nxcode-CQ-7B": "Consistently high scores across all subsets, making it reliable for varied token counts."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Strong performance in most subsets, though slightly drops in the highest token count."
+        }
+    ],
+    "Moderate performance with cost-effectiveness": [
+        {
+            "codegemma-7b-it": "Better performance than codegemma-7b and codegemma-2b, but still affordable."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance with a balance of cost and capability."
+        }
+    ],
+    "Not recommended for high token count tasks": [
+        {
+            "codegemma-2b": "Poor performance across all subsets, especially in higher token counts."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Sharp decline in performance with higher token counts."
+        }
+    ]
+}
llm_insight/HumanEval/4/QS/token_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_2 (85.37) but drops significantly in token_subset_4 (63.41). This suggests it handles moderate token counts well but struggles with higher token counts.",
+    "Nxcode-CQ-7B": "Consistently high performance across all subsets, especially in token_subset_1 (93.9). This indicates robustness across varying token counts.",
+    "codegemma-2b": "Poor performance across all subsets, with a steep decline in token_subset_4 (8.05). Not suitable for tasks with high token counts.",
+    "codegemma-7b": "Moderate performance, but drops significantly in token_subset_3 (27.56) and token_subset_4 (26.34). Limited capability with higher token counts.",
+    "codegemma-7b-it": "Better than codegemma-7b, but still shows a decline in token_subset_3 (37.07) and token_subset_4 (41.83).",
+    "deepseek-coder-1.3b-base": "Low performance across all subsets, with a sharp drop in token_subset_4 (12.32). Not recommended for high token count tasks.",
+    "deepseek-coder-6.7b-base": "Moderate performance, but declines in token_subset_3 (36.34) and token_subset_4 (23.9).",
+    "deepseek_coder-6.7b-instruct": "Strong performance in token_subset_1 (85.98) and token_subset_3 (72.8), but drops in token_subset_4 (56.34).",
+    "deepseek_coder_33b-base": "Moderate performance, with a decline in token_subset_4 (32.8).",
+    "deepseek_coder_33b-instruct": "Strong performance across all subsets, though it drops in token_subset_4 (46.1).",
+    "codeqwen1.5-7b": "Moderate performance, with a decline in token_subset_3 (39.88) and token_subset_4 (40.37).",
+    "global_insights": "Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show the most consistent performance across all token subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle with higher token counts. Larger models generally perform better but still show declines in the highest token subset."
+}
llm_insight/HumanEval/5/EI/CC_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High accuracy and robustness": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across most subsets, making it reliable for diverse tasks."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Strong performance in specific subsets, suitable for tasks similar to CC_subset_1 and CC_subset_4."
+        }
+    ],
+    "Moderate accuracy with cost-effectiveness": [
+        {
+            "deepseek_coder-6.7b-instruct": "Decent performance at a potentially lower computational cost compared to larger models."
+        },
+        {
+            "codeqwen1.5-7b": "Balanced performance and may offer a good trade-off between cost and accuracy."
+        }
+    ],
+    "Not recommended": [
+        {
+            "codegemma-2b": "Poor performance across all subsets."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Very low scores, not suitable for any serious tasks."
+        }
+    ]
+}
llm_insight/HumanEval/5/EI/CC_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in CC_subset_4 (100%) and CC_subset_1 (81.63%), but struggles in CC_subset_3 (33.33%). This indicates variability in handling different subsets, possibly due to the nature of the tasks in each subset.",
+    "Nxcode-CQ-7B": "Consistently high performance across most subsets (CC_subset_1: 87.35%, CC_subset_2: 88.73%, CC_subset_3: 84.17%, CC_subset_5: 98.33%), except for CC_subset_4 (37.5%). This suggests robustness but a potential weakness in tasks similar to CC_subset_4.",
+    "codegemma-2b": "Poor performance across all subsets, with the highest score being 31.63% in CC_subset_1. This model is not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Moderate performance, with the best score in CC_subset_1 (46.84%). Performance drops significantly in other subsets, indicating limited generalization.",
+    "codegemma-7b-it": "Better than codegemma-7b, with the highest score in CC_subset_1 (57.35%). However, performance is inconsistent, especially in CC_subset_4 (2.5%).",
+    "deepseek-coder-1.3b-base": "Very poor performance, with scores below 40% in all subsets. Not recommended for any serious tasks.",
+    "deepseek-coder-6.7b-base": "Slightly better than the 1.3b version, but still underperforms with a maximum score of 51.68% in CC_subset_1.",
+    "deepseek_coder-6.7b-instruct": "Decent performance in CC_subset_1 (73.16%) and CC_subset_2 (76.36%), but struggles in CC_subset_4 (0%).",
+    "deepseek_coder_33b-base": "Inconsistent performance, with highs in CC_subset_4 (40%) and lows in CC_subset_5 (3.33%).",
+    "deepseek_coder_33b-instruct": "Good performance in CC_subset_1 (67.86%), CC_subset_2 (67.09%), and CC_subset_3 (56.67%), but fails in CC_subset_4 (5%).",
+    "codeqwen1.5-7b": "Moderate performance across subsets, with the best score in CC_subset_4 (47.5%). Shows some consistency but lacks top-tier performance.",
+    "global_insights": "Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are the top performers, with Nxcode-CQ-7B being more consistent. Smaller models like codegemma-2b and deepseek-coder-1.3b-base are not suitable for high-accuracy tasks. The instruct versions of models generally perform better than their base counterparts. CC_subset_4 appears to be the most challenging subset for most models."
+}
llm_insight/HumanEval/5/EI/line_counts_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High Performance, High Cost": [
+        {
+            "CodeFuse-DeepSeek-33b": "Consistently high performance across most subsets, though with some variability."
+        },
+        {
+            "Nxcode-CQ-7B": "Strong performance in most scenarios, except for very long lines of code."
+        }
+    ],
+    "Balanced Performance and Cost": [
+        {
+            "deepseek_coder-6.7b-instruct": "Decent performance with instruction tuning, suitable for tasks not involving very long lines of code."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance across most subsets, a good balance for general use."
+        }
+    ],
+    "Low Cost, Limited Performance": [
+        {
+            "codegemma-7b-it": "Better than its base version but still limited, suitable for low-budget scenarios."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Instruction-tuned but inconsistent, only recommended for low-priority tasks."
+        }
+    ]
+}
llm_insight/HumanEval/5/EI/line_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance across most subsets, particularly excelling in line_subset_4 with a perfect score. However, there is variability in performance, indicating potential sensitivity to line count differences.",
+    "Nxcode-CQ-7B": "This model demonstrates consistent performance in line_subset_1, line_subset_2, and line_subset_3 but shows a significant drop in line_subset_4. This suggests potential limitations with very short or very long lines of code.",
+    "codegemma-2b": "The model performs poorly across all subsets, with particularly low scores in line_subset_4 and line_subset_5. This indicates a general lack of capability in handling varying line counts.",
+    "codegemma-7b": "While better than codegemma-2b, this model still struggles, especially with longer lines of code. Performance is inconsistent and generally low.",
+    "codegemma-7b-it": "This model shows improvement over its non-instruction-tuned counterpart but still has significant room for improvement, particularly in handling longer lines of code.",
+    "deepseek-coder-1.3b-base": "The model performs poorly, with very low scores in line_subset_4 and inconsistent performance elsewhere. Not suitable for tasks requiring robustness across line counts.",
+    "deepseek-coder-6.7b-base": "Performance is inconsistent, with a complete failure in line_subset_4. This model is not reliable for tasks involving varying line counts.",
+    "deepseek_coder-6.7b-instruct": "The model shows decent performance in line_subset_1 and line_subset_2 but struggles with longer lines. The instruction tuning helps but is not sufficient for all scenarios.",
+    "deepseek_coder_33b-base": "Performance is middling, with no standout strengths. The model is inconsistent across subsets, indicating limited robustness.",
+    "deepseek_coder_33b-instruct": "This model performs well in line_subset_1 and line_subset_3 but shows significant drops in line_subset_4 and line_subset_5. Instruction tuning helps but does not fully address the limitations.",
+    "codeqwen1.5-7b": "The model shows moderate performance across most subsets but struggles with longer lines. It is a middle-of-the-road option with no extreme weaknesses or strengths.",
+    "global_insights": "Models generally perform better with shorter lines of code (line_subset_1 and line_subset_2) and struggle with longer lines (line_subset_4 and line_subset_5). Instruction-tuned models tend to perform better than their base counterparts, but the improvement is not always significant. The best-performing models are CodeFuse-DeepSeek-33b and Nxcode-CQ-7B, but even they have notable weaknesses."
+}
llm_insight/HumanEval/5/EI/token_counts_recommendation.json
ADDED
@@ -0,0 +1,29 @@
+{
+    "High performance with balanced token handling": [
+        {
+            "CodeFuse-DeepSeek-33b": "Consistently high performance across most token subsets, making it reliable for diverse token ranges."
+        },
+        {
+            "Nxcode-CQ-7B": "Strong performance in lower to mid token subsets, ideal for tasks not requiring very high token counts."
+        }
+    ],
+    "Cost-effective for lower token tasks": [
+        {
+            "deepseek_coder-6.7b-instruct": "Good performance in lower to mid token subsets at a potentially lower cost."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Balanced performance in lower subsets, suitable for tasks with moderate token counts."
+        }
+    ],
+    "Avoid for high token tasks": [
+        {
+            "codegemma-2b": "Poor performance across all subsets, especially in higher token counts."
+        },
+        {
+            "codegemma-7b": "Fails in higher token ranges, not recommended for robust tasks."
+        },
+        {
+            "codegemma-7b-it": "Better than other codegemma variants but still inadequate for high token tasks."
+        }
+    ]
+}
llm_insight/HumanEval/5/EI/token_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance across most subsets, particularly excelling in token_subset_5 with a perfect score. However, there is a noticeable dip in token_subset_3, suggesting potential variability in handling certain token ranges.",
+    "Nxcode-CQ-7B": "This model performs consistently well in subsets 1-4 but has a significant drop in token_subset_5, indicating a potential limitation with very high token counts.",
+    "codegemma-2b": "Poor performance across all subsets, especially in higher token counts, making it unsuitable for tasks requiring robust token handling.",
+    "codegemma-7b": "Moderate performance in lower token subsets but fails completely in higher token ranges, similar to codegemma-2b.",
+    "codegemma-7b-it": "Better than other codegemma variants but still struggles with higher token counts, showing a steep decline in performance.",
+    "deepseek-coder-1.3b-base": "Low performance across the board, with particularly poor results in higher token subsets.",
+    "deepseek-coder-6.7b-base": "Moderate performance in lower token subsets but fails to maintain consistency in higher ranges.",
+    "deepseek_coder-6.7b-instruct": "Strong performance in lower to mid token subsets but drops significantly in token_subset_5, similar to Nxcode-CQ-7B.",
+    "deepseek_coder_33b-base": "Decent performance in lower token subsets but struggles as token counts increase.",
+    "deepseek_coder_33b-instruct": "Good performance in lower subsets but inconsistent in higher token ranges, with a notable drop in token_subset_3.",
+    "codeqwen1.5-7b": "Moderate performance across subsets but fails in higher token counts, similar to other models.",
+    "global_insights": "Models generally perform better in lower token subsets, with performance degrading as token counts increase. CodeFuse-DeepSeek-33b and Nxcode-CQ-7B are exceptions, showing strong performance in most subsets except the highest token range. This suggests that token count is a critical factor in model performance, and models vary significantly in their ability to handle different token ranges."
+}
llm_insight/HumanEval/5/QS/CC_recommendation.json
ADDED
@@ -0,0 +1,29 @@
+{
+    "High accuracy and robustness": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across all subsets, making it the most reliable choice for high-accuracy tasks."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Strong performance across all subsets, particularly in subsets 1 and 4, indicating robustness."
+        },
+        {
+            "deepseek_coder_33b-instruct": "Reliable performance across all subsets, with high scores in subsets 1 and 4, making it a strong contender for high-accuracy tasks."
+        }
+    ],
+    "Moderate accuracy with cost-effectiveness": [
+        {
+            "codegemma-7b-it": "Better performance than codegemma-7b and more cost-effective than larger models, suitable for tasks where moderate accuracy is acceptable."
+        },
+        {
+            "codeqwen1.5-7b": "Moderate performance in subsets 1-4, making it a cost-effective choice for tasks where high accuracy is not critical."
+        }
+    ],
+    "Low accuracy for lightweight tasks": [
+        {
+            "codegemma-2b": "Suitable for lightweight tasks where accuracy is not a priority."
+        },
+        {
+            "deepseek-coder-1.3b-base": "Low performance but may be suitable for very basic tasks where cost is a major concern."
+        }
+    ]
+}
llm_insight/HumanEval/5/QS/CC_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 1-4 but drops significantly in subset 5, indicating potential issues with specific types of data in subset 5.",
+    "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the highest scores in subsets 1 and 3. This model is robust and reliable across different data types.",
+    "codegemma-2b": "Poor performance across all subsets, particularly in subset 5. This model is not suitable for tasks requiring high accuracy.",
+    "codegemma-7b": "Moderate performance with a noticeable drop in subset 5. The model struggles with certain data types but performs reasonably well in others.",
+    "codegemma-7b-it": "Better performance than codegemma-7b, especially in subsets 2-5, but still not as strong as the top-performing models.",
+    "deepseek-coder-1.3b-base": "Low performance across all subsets, similar to codegemma-2b. Not recommended for high-accuracy tasks.",
+    "deepseek-coder-6.7b-base": "Moderate performance with a significant drop in subset 5. The model is inconsistent across different data types.",
+    "deepseek_coder-6.7b-instruct": "Strong performance across all subsets, particularly in subsets 1 and 4. This model is reliable and performs well under various conditions.",
+    "deepseek_coder_33b-base": "Good performance in subsets 1-4 but drops in subset 5. The model is generally reliable but has some weaknesses.",
+    "deepseek_coder_33b-instruct": "Strong performance across all subsets, with particularly high scores in subsets 1 and 4. This model is robust and performs well under various conditions.",
+    "codeqwen1.5-7b": "Moderate performance with a noticeable drop in subset 5. The model is inconsistent but performs reasonably well in subsets 1-4.",
+    "global_insights": "The models Nxcode-CQ-7B, deepseek_coder-6.7b-instruct, and deepseek_coder_33b-instruct consistently perform well across all subsets, indicating robustness. Subset 5 appears to be challenging for most models, suggesting it contains more complex or diverse data. The codegemma and deepseek-coder-1.3b-base models generally underperform, making them less suitable for high-accuracy tasks."
+}
llm_insight/HumanEval/5/QS/line_counts_recommendation.json
ADDED
@@ -0,0 +1,23 @@
+{
+    "High performance and robustness across varying line counts": [
+        {
+            "Nxcode-CQ-7B": "This model consistently performs well across all subsets, making it ideal for tasks requiring stability and high accuracy regardless of line count."
+        }
+    ],
+    "Balanced performance and cost-effectiveness": [
+        {
+            "deepseek_coder-6.7b-instruct": "This model offers strong performance in lower line counts and is more cost-effective than larger models, suitable for tasks where line counts are moderate."
+        },
+        {
+            "CodeFuse-DeepSeek-33b": "Provides decent performance across subsets and is a good balance between cost and performance for tasks with varying line counts."
+        }
+    ],
+    "Budget-conscious scenarios with lower line counts": [
+        {
+            "codeqwen1.5-7b": "A cost-effective option for tasks with lower line counts, though performance drops as line counts increase."
+        },
+        {
+            "codegemma-7b-it": "Suitable for budget-conscious scenarios where line counts are low to moderate, though performance is not as robust as larger models."
+        }
+    ]
+}
llm_insight/HumanEval/5/QS/line_counts_report.json
ADDED
@@ -0,0 +1,14 @@
+{
+    "CodeFuse-DeepSeek-33b": "The model shows consistent performance across subsets, with a peak in line_subset_2 (84.85). However, it has a noticeable dip in line_subset_4 (66.67), indicating potential sensitivity to certain line count distributions.",
+    "Nxcode-CQ-7B": "This model demonstrates strong and stable performance across all subsets, with the highest score in line_subset_1 (92.27) and maintaining above 87 in most subsets. It is robust to varying line counts.",
+    "codegemma-2b": "Performance is consistently low across all subsets, with the highest score in line_subset_1 (51.67) and dropping significantly in other subsets. This suggests the model struggles with tasks involving varying line counts.",
+    "codegemma-7b": "While better than codegemma-2b, this model still underperforms, with scores ranging from 28.44 to 60.45. It shows a declining trend as line counts increase.",
+    "codegemma-7b-it": "This model performs better than its non-IT counterpart, with scores ranging from 39.69 to 73.33. However, it still shows a decline as line counts increase.",
+    "deepseek-coder-1.3b-base": "The model's performance is poor, with scores ranging from 21.72 to 52.73. It shows a clear downward trend as line counts increase.",
+    "deepseek-coder-6.7b-base": "Performance is moderate, with scores ranging from 29.53 to 74.09. Like other models, it shows a decline with increasing line counts.",
+    "deepseek_coder-6.7b-instruct": "This model shows strong performance in line_subset_1 (79.55) and line_subset_2 (84.39), but drops significantly in line_subset_5 (52.03). It is sensitive to higher line counts.",
+    "deepseek_coder_33b-base": "Performance is moderate, with scores ranging from 38.59 to 75.0. It shows a consistent decline as line counts increase.",
+    "deepseek_coder_33b-instruct": "This model performs well in line_subset_1 (82.27) but shows a steady decline to 50.16 in line_subset_5. It is less robust to higher line counts.",
+    "codeqwen1.5-7b": "Performance is moderate, with scores ranging from 41.72 to 63.18. It shows a steady decline as line counts increase.",
+    "global_insights": "Most models show a decline in performance as line counts increase, indicating a general sensitivity to larger code blocks. Nxcode-CQ-7B stands out as the most robust model across all subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle significantly, while larger models like Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show better but varying performance."
+}
llm_insight/HumanEval/5/QS/token_counts_recommendation.json
ADDED
@@ -0,0 +1,26 @@
+{
+    "High performance and robustness": [
+        {
+            "Nxcode-CQ-7B": "Consistently high performance across all token subsets, making it reliable for varying input sizes."
+        },
+        {
+            "deepseek_coder-6.7b-instruct": "Strong performance in most subsets, though it shows some inconsistency in the largest token subset."
+        }
+    ],
+    "Moderate performance with cost-effectiveness": [
+        {
+            "codegemma-7b-it": "Balances performance and cost, though it declines with larger token counts."
+        },
+        {
+            "deepseek_coder-33b-instruct": "Good performance in smaller token subsets and maintains a baseline in larger ones."
+        }
+    ],
+    "Low-cost with basic capabilities": [
+        {
+            "codeqwen1.5-7b": "Moderate performance at a lower cost, suitable for less demanding tasks."
+        },
+        {
+            "deepseek-coder-6.7b-base": "Basic performance at a lower cost, though it declines with larger token counts."
+        }
+    ]
+}
llm_insight/HumanEval/5/QS/token_counts_report.json
ADDED
@@ -0,0 +1,15 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows strong performance in token_subset_3 (90.91) but has inconsistent results across other subsets, indicating potential instability with varying token counts.",
    "Nxcode-CQ-7B": "Consistently high performance across all subsets, especially in token_subset_1 (95.45), suggesting robustness to token count variations.",
    "codegemma-2b": "Poor performance across all subsets, with a significant drop in token_subset_5 (5.31), indicating it struggles with larger token counts.",
    "codegemma-7b": "Moderate performance with a steady decline as token counts increase, suggesting limitations in handling larger inputs.",
    "codegemma-7b-it": "Better than codegemma-7b but still shows a decline in performance with increasing token counts, though it maintains a baseline in token_subset_5 (39.22).",
    "deepseek-coder-1.3b-base": "Low performance across all subsets, with a sharp drop in token_subset_4 (15.45) and token_subset_5 (10.94), indicating it is not suitable for larger token counts.",
    "deepseek-coder-6.7b-base": "Moderate performance with a steady decline as token counts increase, similar to codegemma-7b.",
    "deepseek_coder-6.7b-instruct": "Strong performance in token_subset_1 (87.88) and token_subset_4 (76.36), but a significant drop in token_subset_5 (51.72), indicating inconsistency.",
    "deepseek_coder_33b-base": "Moderate performance with a steady decline as token counts increase, similar to other base models.",
    "deepseek_coder_33b-instruct": "Strong performance in token_subset_1 (87.12) but declines steadily, though it maintains a baseline in token_subset_4 (64.7).",
    "codeqwen1.5-7b": "Moderate performance with a steady decline as token counts increase, similar to other 7b models.",
    "new_model": "Performance mirrors codegemma-7b-it, suggesting similar capabilities and limitations.",
    "global_insights": "Nxcode-CQ-7B and deepseek_coder-6.7b-instruct show the highest and most consistent performance across subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base struggle significantly with larger token counts. There is a general trend of declining performance as token counts increase, with some models like Nxcode-CQ-7B being exceptions."
}
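The report files are flat by contrast: one free-text insight per model plus a "global_insights" summary string. A small sketch, assuming only that schema (the helper is illustrative, not code from this commit), that separates the two for rendering:

import json

def load_report(path):
    with open(path, "r", encoding="utf-8") as f:
        report = json.load(f)
    # Pull the cross-model summary out so the per-model notes can be listed alone.
    global_insights = report.pop("global_insights", "")
    return report, global_insights

model_notes, summary = load_report("llm_insight/HumanEval/5/QS/token_counts_report.json")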
llm_insight/HumanEval/6/EI/CC_recommendation.json
ADDED
@@ -0,0 +1,26 @@
{
    "High accuracy tasks with tolerance for variability": [
        {
            "CodeFuse-DeepSeek-33b": "Achieves perfect scores in subsets 4 and 5, making it ideal for tasks where these subsets are representative."
        },
        {
            "Nxcode-CQ-7B": "Excels in subsets 2 and 6, suitable for tasks prioritizing these data types."
        }
    ],
    "Balanced performance and cost-effectiveness": [
        {
            "deepseek_coder-6.7b-instruct": "Offers strong performance in multiple subsets without the extreme variability of larger models."
        },
        {
            "codeqwen1.5-7b": "Provides good performance in subset 5 and moderate performance elsewhere, suitable for balanced tasks."
        }
    ],
    "Low-resource environments": [
        {
            "codegemma-7b-it": "Better performance than its non-IT counterpart, suitable for environments with limited computational resources."
        },
        {
            "deepseek-coder-6.7b-base": "Moderate performance at a lower computational cost compared to larger models."
        }
    ]
}
llm_insight/HumanEval/6/EI/CC_report.json
ADDED
@@ -0,0 +1,14 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 4 and 5 with perfect scores, but has a significant drop in subset 3 (46.67). This indicates potential variability in handling different types of data within the CC perspective.",
    "Nxcode-CQ-7B": "This model excels in subsets 2 and 6 (91.41 and 98.33) but performs poorly in subset 4 (20.0). The wide range of scores suggests sensitivity to specific data characteristics.",
    "codegemma-2b": "Consistently low performance across all subsets, with the highest score being 31.63 in subset 1. This model may not be suitable for tasks requiring high accuracy.",
    "codegemma-7b": "Moderate performance with peaks in subset 1 (46.84) and subset 5 (40.0). The model struggles in subset 4 (0.0), indicating a potential weakness in certain data types.",
    "codegemma-7b-it": "Better than its non-IT counterpart, with a high of 57.35 in subset 1. However, it still shows significant drops in subsets 4 and 5 (0.0 and 5.0).",
    "deepseek-coder-1.3b-base": "Very low performance across all subsets, with no scores above 38.57. Not recommended for high-accuracy tasks.",
    "deepseek-coder-6.7b-base": "Moderate performance with a high of 51.68 in subset 1. Similar to other models, it fails in subset 4 (0.0).",
    "deepseek_coder-6.7b-instruct": "Strong performance in subsets 1, 2, and 3 (73.16, 77.72, 56.67), but fails in subset 4 (0.0). This suggests good generalizability except for specific data types.",
    "deepseek_coder_33b-base": "Variable performance with a high of 80.0 in subset 5 but 0.0 in subset 4. This inconsistency may limit its reliability.",
    "deepseek_coder_33b-instruct": "Consistently good performance in subsets 1, 2, and 3 (67.86, 70.33, 53.0), but struggles in subset 4 (0.0). The model shows promise but has clear limitations.",
    "codeqwen1.5-7b": "Strong performance in subset 5 (95.0) but fails in subset 4 (0.0). The model's performance is otherwise moderate, indicating potential for specific use cases.",
    "global_insights": "1. Most models struggle with subset 4, indicating a potential outlier or particularly challenging data type within the CC perspective. 2. Larger models (e.g., 33b variants) generally perform better but are not immune to significant drops in specific subsets. 3. The Nxcode-CQ-7B and CodeFuse-DeepSeek-33b models show the highest peaks but also the most variability, suggesting a trade-off between peak performance and consistency."
}
llm_insight/HumanEval/6/EI/line_counts_recommendation.json
ADDED
@@ -0,0 +1,31 @@
{
    "Small to Medium Line Counts (Cost-Effective)": [
        {
            "Nxcode-CQ-7B": "Consistently high performance in subsets 1-4, making it reliable for smaller to medium line counts."
        },
        {
            "deepseek_coder-6.7b-instruct": "Strong performance in subsets 1-4, offering a good balance of cost and effectiveness."
        }
    ],
    "Large Line Counts (High Performance)": [
        {
            "CodeFuse-DeepSeek-33b": "Exceptional performance in subset 6, though inconsistent in others. Best for tasks focusing on large line counts."
        }
    ],
    "General Purpose (Balanced)": [
        {
            "codeqwen1.5-7b": "Moderate performance across most subsets, suitable for general use where line counts vary."
        },
        {
            "deepseek_coder_33b-instruct": "More consistent than its base version, offering a balance for varied line counts."
        }
    ],
    "Avoid": [
        {
            "codegemma-2b": "Poor performance across all subsets."
        },
        {
            "deepseek-coder-1.3b-base": "Consistently low scores, not suitable for any scenario."
        }
    ]
}
llm_insight/HumanEval/6/EI/line_counts_report.json
ADDED
@@ -0,0 +1,14 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 1-3 and subset 6, but a significant drop in subset 4. This suggests it handles larger line counts well but struggles with medium-sized subsets.",
    "Nxcode-CQ-7B": "Consistently high performance across subsets 1-4, with a notable drop in subsets 5-6. This indicates robustness for smaller to medium line counts but less effectiveness for larger ones.",
    "codegemma-2b": "Poor performance across all subsets, with particularly low scores in subsets 4-6. Not suitable for tasks requiring handling of larger line counts.",
    "codegemma-7b": "Moderate performance in subsets 1-3, but sharply declines in subsets 4-6. Better than codegemma-2b but still not reliable for larger line counts.",
    "codegemma-7b-it": "Improved over codegemma-7b, especially in subsets 1-3, but still struggles with larger line counts. A middle-ground option among codegemma models.",
    "deepseek-coder-1.3b-base": "Low performance across the board, with slight improvements in subset 6. Not recommended for any line count range.",
    "deepseek-coder-6.7b-base": "Better than 1.3b but still inconsistent, with a sharp drop in subsets 4-6. Limited utility for larger line counts.",
    "deepseek_coder-6.7b-instruct": "Strong in subsets 1-4, but performance halves in subsets 5-6. Good for smaller to medium line counts but not larger ones.",
    "deepseek_coder_33b-base": "Variable performance, with high scores in subsets 1 and 6 but poor in subset 5. Unpredictable for consistent use.",
    "deepseek_coder_33b-instruct": "Similar to 33b-base but slightly more consistent in subsets 1-4. Still not reliable for larger line counts.",
    "codeqwen1.5-7b": "Moderate in subsets 1-3, but drops significantly in subsets 4-6. A balanced option but not for larger line counts.",
    "global_insights": "Models generally perform better on smaller to medium line counts (subsets 1-3) and struggle with larger ones (subsets 4-6). Nxcode-CQ-7B and CodeFuse-DeepSeek-33b are top performers but have specific weaknesses. Codegemma models are consistently weak, while deepseek models show variability. Instruct versions of models often outperform their base counterparts."
}
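Since app.py renders everything through Gradio, one plausible way to surface a report like the one above in the UI is to format it as markdown for a gr.Markdown component. This helper is a sketch under that assumption, not code from this commit:

def report_to_markdown(report: dict) -> str:
    lines = []
    for model, insight in report.items():
        if model == "global_insights":
            continue  # the summary is appended separately below
        lines.append(f"- **{model}**: {insight}")
    if "global_insights" in report:
        lines.append("")  # blank line before the closing summary paragraph
        lines.append(report["global_insights"])
    return "\n".join(lines)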
llm_insight/HumanEval/6/EI/token_counts_recommendation.json
ADDED
@@ -0,0 +1,26 @@
{
    "High performance with moderate token counts": [
        {
            "Nxcode-CQ-7B": "Consistently high performance across most subsets except the largest."
        },
        {
            "deepseek_coder-6.7b-instruct": "Robust performance for most token counts, though not the largest."
        }
    ],
    "Cost-effective for small to medium tasks": [
        {
            "codegemma-7b-it": "Better performance than smaller codegemma models for mid-range token counts."
        },
        {
            "deepseek-coder-6.7b-base": "Balanced performance and resource usage for smaller tasks."
        }
    ],
    "Large and complex tasks": [
        {
            "CodeFuse-DeepSeek-33b": "Handles larger token counts better than most, though with some variability."
        },
        {
            "deepseek_coder_33b-instruct": "Good for large inputs but inconsistent in some subsets."
        }
    ]
}
llm_insight/HumanEval/6/EI/token_counts_report.json
ADDED
@@ -0,0 +1,14 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets with higher token counts (token_subset_2 and token_subset_6), but struggles in token_subset_3 and token_subset_4. This suggests it may be better suited for larger code snippets or more complex tasks.",
    "Nxcode-CQ-7B": "Consistently performs well across most subsets except token_subset_6, where it drops significantly. This indicates robustness in handling varying token counts but may have limitations with very large inputs.",
    "codegemma-2b": "Performance degrades sharply as token count increases, making it unsuitable for larger or more complex code tasks.",
    "codegemma-7b": "Similar to codegemma-2b but with slightly better performance across subsets. Still not ideal for larger inputs.",
    "codegemma-7b-it": "Shows improvement over codegemma-7b, particularly in mid-range token subsets, but still fails in token_subset_6.",
    "deepseek-coder-1.3b-base": "Struggles with larger token counts, performing poorly in subsets 3 through 6.",
    "deepseek-coder-6.7b-base": "Better than the 1.3b version but still not suitable for larger inputs.",
    "deepseek_coder-6.7b-instruct": "Performs well in subsets 1-5 but fails in token_subset_6. This suggests it is robust for most tasks but may not handle very large inputs well.",
    "deepseek_coder_33b-base": "Shows moderate performance across subsets but struggles with larger inputs.",
    "deepseek_coder_33b-instruct": "Performs well in subsets 1-2 and subset 5, but drops in subsets 3-4 and fails in subset 6. This indicates variability in handling different token counts.",
    "codeqwen1.5-7b": "Moderate performance across subsets but struggles with larger inputs.",
    "global_insights": "Models generally perform better with smaller token counts, with performance degrading as token count increases. Larger models (e.g., 33b variants) tend to handle larger inputs better but still have limitations. The Nxcode-CQ-7B and deepseek_coder-6.7b-instruct models show the most consistent performance across varying token counts, though they still struggle with the largest inputs."
}
llm_insight/HumanEval/6/QS/CC_recommendation.json
ADDED
@@ -0,0 +1,26 @@
{
    "High accuracy and reliability": [
        {
            "Nxcode-CQ-7B": "Consistently high performance across all subsets, making it the most reliable choice for critical applications."
        },
        {
            "CodeFuse-DeepSeek-33b": "Strong performance in most subsets, though with some instability in subset 6."
        }
    ],
    "Moderate accuracy with cost-effectiveness": [
        {
            "deepseek_coder-6.7b-instruct": "Good performance in most subsets, offering a balance between cost and accuracy."
        },
        {
            "deepseek_coder_33b-instruct": "Solid performance across most subsets, suitable for less critical tasks."
        }
    ],
    "Low-cost with basic performance": [
        {
            "codegemma-7b-it": "Better than its non-IT counterpart, suitable for basic tasks where high accuracy is not required."
        },
        {
            "deepseek-coder-6.7b-base": "Moderate performance, suitable for non-critical applications."
        }
    ]
}
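The dataset, number of parts, split method, and perspective that name these files are the same parameters app.py reads from its dropdowns to build file paths (visible at the top of this diff). A hypothetical wiring of those inputs to the load_recommendations sketch shown earlier; all component names and choice lists here are illustrative, inferred only from the directory layout:

import gradio as gr

with gr.Blocks() as demo:
    dataset = gr.Dropdown(["HumanEval"], value="HumanEval", label="Dataset")
    num_parts = gr.Dropdown([3, 4, 5, 6], value=5, label="Number of parts")
    split = gr.Radio(["EI", "QS"], value="QS", label="Split method")
    perspective = gr.Radio(["CC", "line_counts", "token_counts"],
                           value="token_counts", label="Perspective")
    table = gr.Dataframe(label="Recommendations")
    gr.Button("Load").click(load_recommendations,
                            inputs=[dataset, num_parts, split, perspective],
                            outputs=table)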
llm_insight/HumanEval/6/QS/CC_report.json
ADDED
@@ -0,0 +1,14 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows strong performance in subsets 2, 3, and 5, but struggles in subset 6 with a significant drop to 50.0. This indicates potential instability in handling certain types of data within the CC perspective.",
    "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the lowest score being 80.18 in subset 3. This model demonstrates robustness and reliability across different data splits.",
    "codegemma-2b": "Poor performance across all subsets, particularly in subset 6 with a score of 2.29. This model is not suitable for tasks requiring high accuracy.",
    "codegemma-7b": "Moderate performance with scores ranging from 14.37 to 61.61. The model shows some capability but is inconsistent, especially in subset 6.",
    "codegemma-7b-it": "Better than its non-IT counterpart, with scores ranging from 33.33 to 65.36. Still, it lacks the consistency needed for critical applications.",
    "deepseek-coder-1.3b-base": "Low performance across the board, with a significant drop in subset 6 to 6.67. Not recommended for any serious tasks.",
    "deepseek-coder-6.7b-base": "Moderate performance, with scores between 13.75 and 69.64. Shows some promise but is inconsistent.",
    "deepseek_coder-6.7b-instruct": "Strong performance in subsets 1, 4, and 5, with a notable drop in subset 6 to 54.58. This model is reliable but has some limitations.",
    "deepseek_coder_33b-base": "Moderate to good performance, with scores ranging from 28.96 to 70.71. Shows potential but is not the top performer.",
    "deepseek_coder_33b-instruct": "Good performance across most subsets, with scores between 45.42 and 80.18. A solid choice but not the best.",
    "codeqwen1.5-7b": "Moderate performance, with scores ranging from 29.58 to 64.64. Inconsistent and not recommended for high-stakes tasks.",
    "global_insights": "Nxcode-CQ-7B is the top performer, showing consistent high scores across all subsets. CodeFuse-DeepSeek-33b and deepseek_coder-6.7b-instruct also perform well but have notable drops in subset 6. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly, indicating that model size and architecture play a significant role in performance. The CC perspective seems to favor larger, more robust models."
}
llm_insight/HumanEval/6/QS/line_counts_recommendation.json
ADDED
@@ -0,0 +1,34 @@
{
    "High accuracy tasks with variable line counts": [
        {
            "Nxcode-CQ-7B": "Consistently high performance across most subsets."
        },
        {
            "deepseek_coder-6.7b-instruct": "Strong in subsets with higher line counts."
        }
    ],
    "Cost-effective for moderate line counts": [
        {
            "CodeFuse-DeepSeek-33b": "Balanced performance and cost for moderate line counts."
        },
        {
            "codeqwen1.5-7b": "Moderate performance at a lower cost."
        }
    ],
    "Tasks with smaller line counts": [
        {
            "deepseek-coder-6.7b-base": "Suitable for smaller line counts at a lower cost."
        },
        {
            "deepseek_coder_33b-base": "Better performance for small to medium line counts."
        }
    ],
    "Not recommended for any tasks": [
        {
            "codegemma-2b": "Poor performance across all subsets."
        },
        {
            "deepseek-coder-1.3b-base": "Low accuracy in all scenarios."
        }
    ]
}
llm_insight/HumanEval/6/QS/line_counts_report.json
ADDED
@@ -0,0 +1,15 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows consistent performance across subsets, with a slight dip in subset_4. It performs well in subsets with moderate line counts.",
    "Nxcode-CQ-7B": "This model excels in most subsets, particularly in subset_1 and subset_2, but shows a noticeable drop in subset_6. It is robust for larger line counts.",
    "codegemma-2b": "Performance is poor across all subsets, with the lowest scores in subset_4 and subset_6. Not suitable for tasks requiring high accuracy.",
    "codegemma-7b": "Better than codegemma-2b but still underperforms in subsets with higher line counts. Moderate performance overall.",
    "codegemma-7b-it": "Improved over codegemma-7b, especially in subset_1 and subset_5. Still struggles with higher line counts.",
    "deepseek-coder-1.3b-base": "Low performance across all subsets, particularly in subset_4 and subset_6. Not recommended for complex tasks.",
    "deepseek-coder-6.7b-base": "Moderate performance in subset_1 and subset_2, but drops significantly in subset_6. Suitable for smaller line counts.",
    "deepseek_coder-6.7b-instruct": "Strong performance in subset_2, but inconsistent in others. Good for tasks with variable line counts.",
    "deepseek_coder_33b-base": "Decent performance in subset_1, but declines as line counts increase. Best for smaller to medium line counts.",
    "deepseek_coder_33b-instruct": "High performance in subset_1 and subset_2, but drops in subset_6. Suitable for tasks with moderate line counts.",
    "codeqwen1.5-7b": "Moderate performance across subsets, with a dip in subset_4. Best for tasks with consistent line counts.",
    "new": "Similar to codegemma-7b-it, with better performance in subset_1 and subset_5. Still struggles with higher line counts.",
    "global_insights": "Models like Nxcode-CQ-7B and deepseek_coder-6.7b-instruct perform well in subsets with higher line counts, while smaller models like codegemma-2b struggle. Larger models generally handle variability better, but cost-effectiveness must be considered."
}
llm_insight/HumanEval/6/QS/token_counts_recommendation.json
ADDED
@@ -0,0 +1,23 @@
{
    "High accuracy and robustness across token counts": [
        {
            "Nxcode-CQ-7B": "Consistently high performance across all token subsets, making it reliable for varied token counts."
        }
    ],
    "Balanced performance and cost-effectiveness for low to medium token tasks": [
        {
            "deepseek_coder-6.7b-instruct": "Strong performance in lower token subsets and moderate in medium, suitable for cost-effective solutions."
        },
        {
            "deepseek_coder_33b-instruct": "High performance in low token subsets, good for tasks where token count is controlled."
        }
    ],
    "Budget-friendly for low token tasks": [
        {
            "codegemma-7b-it": "Moderate performance in low token subsets at a lower cost compared to larger models."
        },
        {
            "codeqwen1.5-7b": "Decent performance in low token subsets, suitable for budget constraints."
        }
    ]
}
llm_insight/HumanEval/6/QS/token_counts_report.json
ADDED
@@ -0,0 +1,15 @@
{
    "CodeFuse-DeepSeek-33b": "The model shows high variability in performance across token subsets, peaking at 96.43% in token_subset_3 but dropping to 62.5% in token_subset_6. This suggests sensitivity to token count variations.",
    "Nxcode-CQ-7B": "Consistently high performance across all subsets, with the lowest score at 77.08% in token_subset_6. Demonstrates robustness to token count changes.",
    "codegemma-2b": "Poor performance across all subsets, especially in token_subset_6 with only 1.25%. Not suitable for tasks requiring high accuracy.",
    "codegemma-7b": "Moderate performance with significant drops in higher token subsets. Performance degrades as token count increases.",
    "codegemma-7b-it": "Better than codegemma-7b but still shows a declining trend with increasing token counts. Best performance in token_subset_1 at 75.89%.",
    "deepseek-coder-1.3b-base": "Low performance across all subsets, with a sharp decline in token_subset_6 to 5.62%. Not recommended for high-token tasks.",
    "deepseek-coder-6.7b-base": "Moderate performance with a steady decline as token count increases. Best in token_subset_1 at 73.04%.",
    "deepseek_coder-6.7b-instruct": "Strong performance in lower token subsets (85.71% in token_subset_1) but drops to 46.67% in token_subset_6. Good for low to medium token tasks.",
    "deepseek_coder_33b-base": "Variable performance, peaking at 73.57% in token_subset_1 and dropping to 25.42% in token_subset_6. Sensitive to token count.",
    "deepseek_coder_33b-instruct": "High performance in lower token subsets (86.25% in token_subset_1) but declines to 39.58% in token_subset_6. Suitable for low to medium token tasks.",
    "codeqwen1.5-7b": "Moderate performance with a steady decline as token count increases. Best in token_subset_1 at 69.11%.",
    "new": "Similar to codegemma-7b-it, with best performance in token_subset_1 at 75.89% and declining to 36.25% in token_subset_6.",
    "global_insights": "Models generally perform better in lower token subsets, with performance degrading as token count increases. Nxcode-CQ-7B is the most robust across all subsets. Smaller models like codegemma-2b and deepseek-coder-1.3b-base perform poorly, especially in high-token subsets. Instruct models (e.g., deepseek_coder-6.7b-instruct) show better performance than their base counterparts but still decline with higher token counts."
}
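The commit appears to add a report/recommendation pair for every (dataset, num_parts, split, perspective) combination. A quick, illustrative consistency check (not part of the commit) that the pairs are complete on disk:

from pathlib import Path

counts = {}
for path in Path("llm_insight").glob("*/*/*/*.json"):
    dataset, num_parts, split = path.parts[-4:-1]
    # File stems look like "token_counts_report" or "CC_recommendation".
    perspective, kind = path.stem.rsplit("_", 1)
    counts.setdefault((dataset, num_parts, split, perspective), set()).add(kind)
incomplete = {k: v for k, v in counts.items() if v != {"report", "recommendation"}}
print(f"{len(counts)} combinations found, {len(incomplete)} missing a file")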