lmy0802's picture
Upload 111 files
4daa863 verified
import json
# Load the input records from the JSON file.
with open("sub_mbpp.json", "r", encoding="utf-8") as source:
    data = json.loads(source.read())
# Number of equal-width intervals each feature's range is split into.
num_intervals = 8

# Range of the 'token' feature and the width of a single interval.
token_values = [record['token'] for record in data]
token_min = min(token_values)
token_max = max(token_values)
token_interval_size = (token_max - token_min) / num_intervals

# Range of the 'cc' feature and the width of a single interval.
cc_values = [record['cc'] for record in data]
cyclomatic_complexity_min = min(cc_values)
cyclomatic_complexity_max = max(cc_values)
cyclomatic_complexity_interval_size = (
    cyclomatic_complexity_max - cyclomatic_complexity_min
) / num_intervals
def _bin_index(value, low, width, bins):
    """Return the equal-width bin index of *value*, clamped to ``bins - 1``.

    *low* is the lower edge of the first bin and *width* the size of one bin.
    """
    if width == 0:
        # Every record shares the same value, so the range is empty and a
        # floor division by the zero width would raise ZeroDivisionError.
        # Put everything in the first bin instead.
        return 0
    # The maximum value lands exactly on the upper edge (index == bins), so
    # clamp it into the last bin to keep the index in range.
    return min(int((value - low) // width), bins - 1)


# Number of records falling into each 'token' bin (index 0 is the lowest bin).
token_bin_counts = [0] * num_intervals

# Assign every record to an equal-width bin for each feature.
for item in data:
    # Bin for the 'token' feature; also tallied for the summary printed below.
    item['token_diff'] = _bin_index(
        item['token'], token_min, token_interval_size, num_intervals
    )
    token_bin_counts[item['token_diff']] += 1

    # Bin for the 'cc' feature.
    item['CC_diff'] = _bin_index(
        item['cc'],
        cyclomatic_complexity_min,
        cyclomatic_complexity_interval_size,
        num_intervals,
    )

# Order the records by their id before they are written out.
data.sort(key=lambda x: x['id'])

# Per-bin record counts for the 'token' feature, lowest bin first.
print(*token_bin_counts)
# Persist the annotated records as indented JSON, keeping non-ASCII text as-is.
with open('EI.json', mode='w', encoding='utf-8') as out_file:
    json.dump(data, out_file, indent=4, ensure_ascii=False)