Spaces:
Running
Running
# Bucket every record of a JSON dataset into equal-width intervals, once by
# its 'token' value and once by its 'cc' (cyclomatic complexity) value, then
# write the annotated records to a new JSON file.
import json

INPUT_PATH = "sub_mbpp.json"
OUTPUT_PATH = "EI.json"
NUM_INTERVALS = 4  # number of equal-width intervals per feature


def assign_equal_width_bins(records, source_key, target_key,
                            num_intervals=NUM_INTERVALS):
    """Annotate each record with the equal-width interval of one feature.

    The range ``[min, max]`` of ``record[source_key]`` over all records is
    split into ``num_intervals`` intervals of equal width, and the 0-based
    interval index is stored in ``record[target_key]`` (records are mutated
    in place).

    Args:
        records: list of dicts, each containing ``source_key``.
        source_key: key of the numeric feature to bucket.
        target_key: key under which the interval index is stored.
        num_intervals: number of intervals; must be at least 1.

    Returns:
        A list of length ``num_intervals`` where element ``i`` is the number
        of records that landed in interval ``i``.

    Raises:
        ValueError: if ``num_intervals`` is smaller than 1.
    """
    if num_intervals < 1:
        raise ValueError("num_intervals must be at least 1")

    counts = [0] * num_intervals
    if not records:
        # min()/max() would raise on an empty sequence; nothing to annotate.
        return counts

    values = [record[source_key] for record in records]
    lowest = min(values)
    width = (max(values) - lowest) / num_intervals

    for record, value in zip(records, values):
        if width == 0:
            # Every value is identical: floor-dividing by a zero width would
            # raise ZeroDivisionError, so everything goes into interval 0.
            index = 0
        else:
            # The maximum value computes to num_intervals, one past the last
            # valid index, so clamp it into the final interval.
            index = min(int((value - lowest) // width), num_intervals - 1)
        record[target_key] = index
        counts[index] += 1
    return counts


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Read the dataset, add 'token_diff' and 'CC_diff', and write it back out.

    Prints the number of records in each 'token' interval, space-separated.
    """
    # Load the data.
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Bucket both features into equal-width intervals.
    token_counts = assign_equal_width_bins(data, "token", "token_diff")
    assign_equal_width_bins(data, "cc", "CC_diff")

    # Restore the original order.
    data.sort(key=lambda x: x["id"])
    print(*token_counts)

    # Write the updated data to the output JSON file.
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()