File size: 2,093 Bytes
4daa863
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import json

# Number of equal-width intervals each feature is divided into.
num_intervals = 6


def equal_width_bin(value, low, high, num_bins):
    """Return the 0-based equal-width bin index of ``value`` in ``[low, high]``.

    The range ``[low, high]`` is split into ``num_bins`` intervals of equal
    width.  A value equal to ``high`` is clamped into the last bin.

    Args:
        value: The feature value to classify.
        low: Minimum of the feature over the data set.
        high: Maximum of the feature over the data set.
        num_bins: Number of intervals.

    Returns:
        An int in ``range(num_bins)``, assuming ``low <= value <= high``.
    """
    span = high - low
    if span == 0:
        # Every value is identical: there is only one meaningful bin, and
        # dividing by a zero-width interval would raise ZeroDivisionError.
        return 0
    # Multiply before dividing instead of dividing by a pre-computed float
    # interval width: for integer features this is exact, whereas
    # ``(value - low) // (span / num_bins)`` can fall one bin short on exact
    # boundaries (e.g. ``5 // (10 / 6)`` is 2, not 3).
    index = int((value - low) * num_bins // span)
    return min(index, num_bins - 1)


def assign_bins(data, feature, target, num_bins):
    """Store each record's bin index for ``feature`` under ``target``.

    Args:
        data: List of dict records; modified in place.
        feature: Key of the numeric feature to bin.
        target: Key under which the bin index is written.
        num_bins: Number of equal-width intervals.

    Returns:
        A list of length ``num_bins`` with the number of records per bin.
    """
    counts = [0] * num_bins
    if not data:
        # min()/max() would raise ValueError on an empty data set.
        return counts
    values = [item[feature] for item in data]
    low, high = min(values), max(values)
    for item, value in zip(data, values):
        index = equal_width_bin(value, low, high, num_bins)
        item[target] = index
        counts[index] += 1
    return counts


def main():
    """Bin the 'token' and 'cc' features of sub_mbpp.json and write EI.json."""
    # Load the records.
    with open("sub_mbpp.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    # Equal-width binning of both features; only the cyclomatic-complexity
    # distribution is reported.
    assign_bins(data, 'token', 'token_diff', num_intervals)
    cc_counts = assign_bins(data, 'cc', 'CC_diff', num_intervals)

    # Order the records by id before writing them out.
    data.sort(key=lambda x: x['id'])
    print(*cc_counts)

    # Write the updated records to the output JSON file.
    with open('EI.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()