tianyaogavin commited on
Commit
25666e3
·
1 Parent(s): 5b39f9e

aggr voice segment test

Browse files
dataset/transcripts/test1_segment_10_aggregated_9_to_15_20250424_111404.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_10_aggregated_9_to_15.wav",
3
+ "timestamp": "20250424_111404",
4
+ "segments": [
5
+ {
6
+ "text": "你会学习到如何使用音频数据集",
7
+ "start_time": 0.0,
8
+ "end_time": 1.62,
9
+ "confidence": 0.862060546875,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null,
13
+ "segment_index": [
14
+ 9,
15
+ 10,
16
+ 11,
17
+ 12,
18
+ 13,
19
+ 14,
20
+ 15
21
+ ]
22
+ },
23
+ {
24
+ "text": "包括音频数据加载",
25
+ "start_time": 1.62,
26
+ "end_time": 3.0,
27
+ "confidence": 0.862060546875,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null,
31
+ "segment_index": [
32
+ 9,
33
+ 10,
34
+ 11,
35
+ 12,
36
+ 13,
37
+ 14,
38
+ 15
39
+ ]
40
+ },
41
+ {
42
+ "text": "音频数据预处理",
43
+ "start_time": 3.0,
44
+ "end_time": 3.86,
45
+ "confidence": 0.862060546875,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null,
49
+ "segment_index": [
50
+ 9,
51
+ 10,
52
+ 11,
53
+ 12,
54
+ 13,
55
+ 14,
56
+ 15
57
+ ]
58
+ },
59
+ {
60
+ "text": "以及高效加载大规模音频数据集的流逝加载方法",
61
+ "start_time": 3.86,
62
+ "end_time": 7.2,
63
+ "confidence": 0.862060546875,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null,
67
+ "segment_index": [
68
+ 9,
69
+ 10,
70
+ 11,
71
+ 12,
72
+ 13,
73
+ 14,
74
+ 15
75
+ ]
76
+ },
77
+ {
78
+ "text": "完成本单元的学习后",
79
+ "start_time": 7.2,
80
+ "end_time": 8.28,
81
+ "confidence": 0.862060546875,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null,
85
+ "segment_index": [
86
+ 9,
87
+ 10,
88
+ 11,
89
+ 12,
90
+ 13,
91
+ 14,
92
+ 15
93
+ ]
94
+ },
95
+ {
96
+ "text": "你会掌握基础的音频相关数",
97
+ "start_time": 8.28,
98
+ "end_time": 10.88,
99
+ "confidence": 0.862060546875,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null,
103
+ "segment_index": [
104
+ 9,
105
+ 10,
106
+ 11,
107
+ 12,
108
+ 13,
109
+ 14,
110
+ 15
111
+ ]
112
+ },
113
+ {
114
+ "text": "并且掌握针对不同应用的音频数据处理工具",
115
+ "start_time": 10.88,
116
+ "end_time": 14.42,
117
+ "confidence": 0.862060546875,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null,
121
+ "segment_index": [
122
+ 9,
123
+ 10,
124
+ 11,
125
+ 12,
126
+ 13,
127
+ 14,
128
+ 15
129
+ ]
130
+ }
131
+ ]
132
+ }
dataset/transcripts/test1_segment_10_aggregated_9_to_16_20250424_111418.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_10_aggregated_9_to_16.wav",
3
+ "timestamp": "20250424_111418",
4
+ "segments": [
5
+ {
6
+ "text": "你会学习到如何使用音频数据集",
7
+ "start_time": 0.0,
8
+ "end_time": 1.62,
9
+ "confidence": 0.8543701171875,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null,
13
+ "segment_index": [
14
+ 9,
15
+ 10,
16
+ 11,
17
+ 12,
18
+ 13,
19
+ 14,
20
+ 15,
21
+ 16
22
+ ]
23
+ },
24
+ {
25
+ "text": "包括音频数据加载",
26
+ "start_time": 1.62,
27
+ "end_time": 3.0,
28
+ "confidence": 0.8543701171875,
29
+ "verified": false,
30
+ "verified_text": null,
31
+ "verification_notes": null,
32
+ "segment_index": [
33
+ 9,
34
+ 10,
35
+ 11,
36
+ 12,
37
+ 13,
38
+ 14,
39
+ 15,
40
+ 16
41
+ ]
42
+ },
43
+ {
44
+ "text": "音频数据预处理",
45
+ "start_time": 3.0,
46
+ "end_time": 3.86,
47
+ "confidence": 0.8543701171875,
48
+ "verified": false,
49
+ "verified_text": null,
50
+ "verification_notes": null,
51
+ "segment_index": [
52
+ 9,
53
+ 10,
54
+ 11,
55
+ 12,
56
+ 13,
57
+ 14,
58
+ 15,
59
+ 16
60
+ ]
61
+ },
62
+ {
63
+ "text": "以及高效加载大规模音频数据集的流逝加载方法",
64
+ "start_time": 3.86,
65
+ "end_time": 7.2,
66
+ "confidence": 0.8543701171875,
67
+ "verified": false,
68
+ "verified_text": null,
69
+ "verification_notes": null,
70
+ "segment_index": [
71
+ 9,
72
+ 10,
73
+ 11,
74
+ 12,
75
+ 13,
76
+ 14,
77
+ 15,
78
+ 16
79
+ ]
80
+ },
81
+ {
82
+ "text": "完成本单元的学习后",
83
+ "start_time": 7.2,
84
+ "end_time": 8.28,
85
+ "confidence": 0.8543701171875,
86
+ "verified": false,
87
+ "verified_text": null,
88
+ "verification_notes": null,
89
+ "segment_index": [
90
+ 9,
91
+ 10,
92
+ 11,
93
+ 12,
94
+ 13,
95
+ 14,
96
+ 15,
97
+ 16
98
+ ]
99
+ },
100
+ {
101
+ "text": "你会掌握基础的音频相关数",
102
+ "start_time": 8.28,
103
+ "end_time": 10.88,
104
+ "confidence": 0.8543701171875,
105
+ "verified": false,
106
+ "verified_text": null,
107
+ "verification_notes": null,
108
+ "segment_index": [
109
+ 9,
110
+ 10,
111
+ 11,
112
+ 12,
113
+ 13,
114
+ 14,
115
+ 15,
116
+ 16
117
+ ]
118
+ },
119
+ {
120
+ "text": "并且掌握针对不同应用的音频数据处理工具",
121
+ "start_time": 10.88,
122
+ "end_time": 14.42,
123
+ "confidence": 0.8543701171875,
124
+ "verified": false,
125
+ "verified_text": null,
126
+ "verification_notes": null,
127
+ "segment_index": [
128
+ 9,
129
+ 10,
130
+ 11,
131
+ 12,
132
+ 13,
133
+ 14,
134
+ 15,
135
+ 16
136
+ ]
137
+ },
138
+ {
139
+ "text": "本单元的知识会成为后面章节的基础",
140
+ "start_time": 14.42,
141
+ "end_time": 16.8,
142
+ "confidence": 0.8543701171875,
143
+ "verified": false,
144
+ "verified_text": null,
145
+ "verification_notes": null,
146
+ "segment_index": [
147
+ 9,
148
+ 10,
149
+ 11,
150
+ 12,
151
+ 13,
152
+ 14,
153
+ 15,
154
+ 16
155
+ ]
156
+ }
157
+ ]
158
+ }
dataset/transcripts/test1_segment_17_aggregated_8_to_9_20250424_105049.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_17_aggregated_8_to_9.wav",
3
+ "timestamp": "20250424_105049",
4
+ "segments": [
5
+ {
6
+ "text": "本单元的知识会成为后面章节的基础",
7
+ "start_time": 0.0,
8
+ "end_time": 2.32,
9
+ "confidence": 0.657958984375,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "音频数据处理",
16
+ "start_time": 2.32,
17
+ "end_time": 3.56,
18
+ "confidence": 0.657958984375,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ }
23
+ ]
24
+ }
dataset/transcripts/test1_segment_1_20250423_163518.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_1.wav",
3
+ "timestamp": "20250423_163518",
4
+ "segments": [
5
+ {
6
+ "text": "第一單元",
7
+ "start_time": 3.26,
8
+ "end_time": 3.9,
9
+ "confidence": 0.546142578125,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "音频数据处理",
16
+ "start_time": 4.34,
17
+ "end_time": 5.74,
18
+ "confidence": 0.302734375,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ },
23
+ {
24
+ "text": "单元简介",
25
+ "start_time": 7.1,
26
+ "end_time": 7.859999999999999,
27
+ "confidence": 0.642578125,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null
31
+ },
32
+ {
33
+ "text": "所有音频或语音相关的任务都需要使用音频文件",
34
+ "start_time": 8.8,
35
+ "end_time": 12.4,
36
+ "confidence": 0.93402099609375,
37
+ "verified": false,
38
+ "verified_text": null,
39
+ "verification_notes": null
40
+ },
41
+ {
42
+ "text": "在我们深入了解这些任务之前,我们需要了解音频文件的实际内容。",
43
+ "start_time": 12.8,
44
+ "end_time": 16.8,
45
+ "confidence": 0.844482421875,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null
49
+ },
50
+ {
51
+ "text": "以及如何利用一瓶文件",
52
+ "start_time": 17.32,
53
+ "end_time": 18.72,
54
+ "confidence": 0.802001953125,
55
+ "verified": false,
56
+ "verified_text": null,
57
+ "verification_notes": null
58
+ },
59
+ {
60
+ "text": "本来原将为你介绍的",
61
+ "start_time": 19.76,
62
+ "end_time": 21.040000000000003,
63
+ "confidence": 0.7650146484375,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null
67
+ },
68
+ {
69
+ "text": "本单元将为你介绍与音频数据相关的基本概念",
70
+ "start_time": 21.62,
71
+ "end_time": 25.62,
72
+ "confidence": 0.87860107421875,
73
+ "verified": false,
74
+ "verified_text": null,
75
+ "verification_notes": null
76
+ },
77
+ {
78
+ "text": "包括波形彩虹绿和冰补涂",
79
+ "start_time": 26.28,
80
+ "end_time": 28.080000000000002,
81
+ "confidence": 0.93768310546875,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null
85
+ },
86
+ {
87
+ "text": "你会学习到如何使用音频数语集",
88
+ "start_time": 28.56,
89
+ "end_time": 30.36,
90
+ "confidence": 0.90057373046875,
91
+ "verified": false,
92
+ "verified_text": null,
93
+ "verification_notes": null
94
+ },
95
+ {
96
+ "text": "包括音频数语加载",
97
+ "start_time": 30.36,
98
+ "end_time": 31.599999999999998,
99
+ "confidence": 0.90057373046875,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null
103
+ },
104
+ {
105
+ "text": "音频数据预处理以及",
106
+ "start_time": 31.98,
107
+ "end_time": 33.22,
108
+ "confidence": 0.597412109375,
109
+ "verified": false,
110
+ "verified_text": null,
111
+ "verification_notes": null
112
+ },
113
+ {
114
+ "text": "高效加载大规模音频数据集的流式加载方法",
115
+ "start_time": 33.54,
116
+ "end_time": 36.54,
117
+ "confidence": 0.76708984375,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null
121
+ },
122
+ {
123
+ "text": "完成本大圆的学习后",
124
+ "start_time": 37.82,
125
+ "end_time": 38.94,
126
+ "confidence": 0.88128662109375,
127
+ "verified": false,
128
+ "verified_text": null,
129
+ "verification_notes": null
130
+ },
131
+ {
132
+ "text": "你会掌握",
133
+ "start_time": 39.34,
134
+ "end_time": 40.34,
135
+ "confidence": 0.375,
136
+ "verified": false,
137
+ "verified_text": null,
138
+ "verification_notes": null
139
+ },
140
+ {
141
+ "text": "基础的音频相关数",
142
+ "start_time": 40.86,
143
+ "end_time": 42.38,
144
+ "confidence": 0.30810546875,
145
+ "verified": false,
146
+ "verified_text": null,
147
+ "verification_notes": null
148
+ },
149
+ {
150
+ "text": "并且掌握针对不同应用的音频数据处理工具",
151
+ "start_time": 43.04,
152
+ "end_time": 46.6,
153
+ "confidence": 0.9736175537109375,
154
+ "verified": false,
155
+ "verified_text": null,
156
+ "verification_notes": null
157
+ },
158
+ {
159
+ "text": "本大员的支持会成为后面章节的基础",
160
+ "start_time": 47.5,
161
+ "end_time": 49.8,
162
+ "confidence": 0.82470703125,
163
+ "verified": false,
164
+ "verified_text": null,
165
+ "verification_notes": null
166
+ }
167
+ ]
168
+ }
dataset/transcripts/test1_segment_1_20250423_190011.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_1.wav",
3
+ "timestamp": "20250423_190011",
4
+ "segments": [
5
+ {
6
+ "text": "第一单元",
7
+ "start_time": 3.26,
8
+ "end_time": 3.9,
9
+ "confidence": 0.85302734375,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "音频数据出来",
16
+ "start_time": 4.34,
17
+ "end_time": 5.56,
18
+ "confidence": 0.4482421875,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ },
23
+ {
24
+ "text": "单元简介",
25
+ "start_time": 7.1,
26
+ "end_time": 7.8,
27
+ "confidence": 0.854736328125,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null
31
+ },
32
+ {
33
+ "text": "所有音频或语音相关的任务都需要使用音频文件",
34
+ "start_time": 8.8,
35
+ "end_time": 12.4,
36
+ "confidence": 0.981781005859375,
37
+ "verified": false,
38
+ "verified_text": null,
39
+ "verification_notes": null
40
+ },
41
+ {
42
+ "text": "在我们深入了解这些任务之前",
43
+ "start_time": 12.8,
44
+ "end_time": 14.600000000000001,
45
+ "confidence": 0.8140869140625,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null
49
+ },
50
+ {
51
+ "text": "我们需要了解音频文件的实际内容",
52
+ "start_time": 14.600000000000001,
53
+ "end_time": 16.78,
54
+ "confidence": 0.8140869140625,
55
+ "verified": false,
56
+ "verified_text": null,
57
+ "verification_notes": null
58
+ },
59
+ {
60
+ "text": "以及如何利用音频文件",
61
+ "start_time": 17.32,
62
+ "end_time": 18.68,
63
+ "confidence": 0.793212890625,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null
67
+ },
68
+ {
69
+ "text": "本单元将为你介绍的",
70
+ "start_time": 19.76,
71
+ "end_time": 21.12,
72
+ "confidence": 0.852783203125,
73
+ "verified": false,
74
+ "verified_text": null,
75
+ "verification_notes": null
76
+ },
77
+ {
78
+ "text": "本单元将为你介绍与音频数据相关的基本概念",
79
+ "start_time": 21.62,
80
+ "end_time": 25.580000000000002,
81
+ "confidence": 0.9444580078125,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null
85
+ },
86
+ {
87
+ "text": "包括波形,彩虹率和冰普渡",
88
+ "start_time": 26.28,
89
+ "end_time": 28.28,
90
+ "confidence": 0.732666015625,
91
+ "verified": false,
92
+ "verified_text": null,
93
+ "verification_notes": null
94
+ },
95
+ {
96
+ "text": "你会学习到如何使用音频数据集,包括音频数据加载。",
97
+ "start_time": 28.56,
98
+ "end_time": 31.56,
99
+ "confidence": 0.953521728515625,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null
103
+ },
104
+ {
105
+ "text": "音频数据处理以及",
106
+ "start_time": 31.98,
107
+ "end_time": 33.18,
108
+ "confidence": 0.685791015625,
109
+ "verified": false,
110
+ "verified_text": null,
111
+ "verification_notes": null
112
+ },
113
+ {
114
+ "text": "高效加载大规模音频数据集的流逝加载方法。",
115
+ "start_time": 33.54,
116
+ "end_time": 36.5,
117
+ "confidence": 0.88739013671875,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null
121
+ },
122
+ {
123
+ "text": "完成本单元的学习后",
124
+ "start_time": 37.82,
125
+ "end_time": 38.94,
126
+ "confidence": 0.9327392578125,
127
+ "verified": false,
128
+ "verified_text": null,
129
+ "verification_notes": null
130
+ },
131
+ {
132
+ "text": "你会掌握",
133
+ "start_time": 39.34,
134
+ "end_time": 40.34,
135
+ "confidence": 0.73193359375,
136
+ "verified": false,
137
+ "verified_text": null,
138
+ "verification_notes": null
139
+ },
140
+ {
141
+ "text": "基础的音频相关数",
142
+ "start_time": 40.86,
143
+ "end_time": 42.4,
144
+ "confidence": 0.609619140625,
145
+ "verified": false,
146
+ "verified_text": null,
147
+ "verification_notes": null
148
+ },
149
+ {
150
+ "text": "并且掌握针对不同应用的音频数据处理工具",
151
+ "start_time": 43.04,
152
+ "end_time": 46.56,
153
+ "confidence": 0.96221923828125,
154
+ "verified": false,
155
+ "verified_text": null,
156
+ "verification_notes": null
157
+ },
158
+ {
159
+ "text": "本单元的知识会成为后面章节的基础",
160
+ "start_time": 47.5,
161
+ "end_time": 49.86,
162
+ "confidence": 0.75439453125,
163
+ "verified": false,
164
+ "verified_text": null,
165
+ "verification_notes": null
166
+ }
167
+ ]
168
+ }
dataset/transcripts/test1_segment_1_20250423_190044.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_1.wav",
3
+ "timestamp": "20250423_190044",
4
+ "segments": [
5
+ {
6
+ "text": "第一單元",
7
+ "start_time": 3.26,
8
+ "end_time": 3.9,
9
+ "confidence": 0.546142578125,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "音频数据处理",
16
+ "start_time": 4.34,
17
+ "end_time": 5.74,
18
+ "confidence": 0.302734375,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ },
23
+ {
24
+ "text": "单元简介",
25
+ "start_time": 7.1,
26
+ "end_time": 7.859999999999999,
27
+ "confidence": 0.642578125,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null
31
+ },
32
+ {
33
+ "text": "所有音频或语音相关的任务都需要使用音频文件",
34
+ "start_time": 8.8,
35
+ "end_time": 12.4,
36
+ "confidence": 0.93402099609375,
37
+ "verified": false,
38
+ "verified_text": null,
39
+ "verification_notes": null
40
+ },
41
+ {
42
+ "text": "在我们深入了解这些任务之前,我们需要了解音频文件的实际内容。",
43
+ "start_time": 12.8,
44
+ "end_time": 16.8,
45
+ "confidence": 0.844482421875,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null
49
+ },
50
+ {
51
+ "text": "以及如何利用一瓶文件",
52
+ "start_time": 17.32,
53
+ "end_time": 18.72,
54
+ "confidence": 0.802001953125,
55
+ "verified": false,
56
+ "verified_text": null,
57
+ "verification_notes": null
58
+ },
59
+ {
60
+ "text": "本来原将为你介绍的",
61
+ "start_time": 19.76,
62
+ "end_time": 21.040000000000003,
63
+ "confidence": 0.7650146484375,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null
67
+ },
68
+ {
69
+ "text": "本单元将为你介绍与音频数据相关的基本概念",
70
+ "start_time": 21.62,
71
+ "end_time": 25.62,
72
+ "confidence": 0.87860107421875,
73
+ "verified": false,
74
+ "verified_text": null,
75
+ "verification_notes": null
76
+ },
77
+ {
78
+ "text": "包括波形彩虹绿和冰补涂",
79
+ "start_time": 26.28,
80
+ "end_time": 28.080000000000002,
81
+ "confidence": 0.93768310546875,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null
85
+ },
86
+ {
87
+ "text": "你会学习到如何使用音频数语集",
88
+ "start_time": 28.56,
89
+ "end_time": 30.36,
90
+ "confidence": 0.90057373046875,
91
+ "verified": false,
92
+ "verified_text": null,
93
+ "verification_notes": null
94
+ },
95
+ {
96
+ "text": "包括音频数语加载",
97
+ "start_time": 30.36,
98
+ "end_time": 31.599999999999998,
99
+ "confidence": 0.90057373046875,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null
103
+ },
104
+ {
105
+ "text": "音频数据预处理以及",
106
+ "start_time": 31.98,
107
+ "end_time": 33.22,
108
+ "confidence": 0.597412109375,
109
+ "verified": false,
110
+ "verified_text": null,
111
+ "verification_notes": null
112
+ },
113
+ {
114
+ "text": "高效加载大规模音频数据集的流式加载方法",
115
+ "start_time": 33.54,
116
+ "end_time": 36.54,
117
+ "confidence": 0.76708984375,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null
121
+ },
122
+ {
123
+ "text": "完成本大圆的学习后",
124
+ "start_time": 37.82,
125
+ "end_time": 38.94,
126
+ "confidence": 0.88128662109375,
127
+ "verified": false,
128
+ "verified_text": null,
129
+ "verification_notes": null
130
+ },
131
+ {
132
+ "text": "你会掌握",
133
+ "start_time": 39.34,
134
+ "end_time": 40.34,
135
+ "confidence": 0.375,
136
+ "verified": false,
137
+ "verified_text": null,
138
+ "verification_notes": null
139
+ },
140
+ {
141
+ "text": "基础的音频相关数",
142
+ "start_time": 40.86,
143
+ "end_time": 42.38,
144
+ "confidence": 0.30810546875,
145
+ "verified": false,
146
+ "verified_text": null,
147
+ "verification_notes": null
148
+ },
149
+ {
150
+ "text": "并且掌握针对不同应用的音频数据处理工具",
151
+ "start_time": 43.04,
152
+ "end_time": 46.6,
153
+ "confidence": 0.9736175537109375,
154
+ "verified": false,
155
+ "verified_text": null,
156
+ "verification_notes": null
157
+ },
158
+ {
159
+ "text": "本大员的支持会成为后面章节的基础",
160
+ "start_time": 47.5,
161
+ "end_time": 49.8,
162
+ "confidence": 0.82470703125,
163
+ "verified": false,
164
+ "verified_text": null,
165
+ "verification_notes": null
166
+ }
167
+ ]
168
+ }
dataset/transcripts/test1_segment_1_20250423_201934.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_1.wav",
3
+ "timestamp": "20250423_201934",
4
+ "segments": [
5
+ {
6
+ "text": "第一单元",
7
+ "start_time": 3.26,
8
+ "end_time": 3.9,
9
+ "confidence": 0.85302734375,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "音频数据出来",
16
+ "start_time": 4.34,
17
+ "end_time": 5.56,
18
+ "confidence": 0.4482421875,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ },
23
+ {
24
+ "text": "单元简介",
25
+ "start_time": 7.1,
26
+ "end_time": 7.8,
27
+ "confidence": 0.854736328125,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null
31
+ },
32
+ {
33
+ "text": "所有音频或语音相关的任务都需要使用音频文件",
34
+ "start_time": 8.8,
35
+ "end_time": 12.4,
36
+ "confidence": 0.981781005859375,
37
+ "verified": false,
38
+ "verified_text": null,
39
+ "verification_notes": null
40
+ },
41
+ {
42
+ "text": "在我们深入了解这些任务之前",
43
+ "start_time": 12.8,
44
+ "end_time": 14.600000000000001,
45
+ "confidence": 0.8140869140625,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null
49
+ },
50
+ {
51
+ "text": "我们需要了解音频文件的实际内容",
52
+ "start_time": 14.600000000000001,
53
+ "end_time": 16.78,
54
+ "confidence": 0.8140869140625,
55
+ "verified": false,
56
+ "verified_text": null,
57
+ "verification_notes": null
58
+ },
59
+ {
60
+ "text": "以及如何利用音频文件",
61
+ "start_time": 17.32,
62
+ "end_time": 18.68,
63
+ "confidence": 0.793212890625,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null
67
+ },
68
+ {
69
+ "text": "本单元将为你介绍的",
70
+ "start_time": 19.76,
71
+ "end_time": 21.12,
72
+ "confidence": 0.852783203125,
73
+ "verified": false,
74
+ "verified_text": null,
75
+ "verification_notes": null
76
+ },
77
+ {
78
+ "text": "本单元将为你介绍与音频数据相关的基本概念",
79
+ "start_time": 21.62,
80
+ "end_time": 25.580000000000002,
81
+ "confidence": 0.9444580078125,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null
85
+ },
86
+ {
87
+ "text": "包括波形,彩虹率和冰普渡",
88
+ "start_time": 26.28,
89
+ "end_time": 28.28,
90
+ "confidence": 0.732666015625,
91
+ "verified": false,
92
+ "verified_text": null,
93
+ "verification_notes": null
94
+ },
95
+ {
96
+ "text": "你会学习到如何使用音频数据集,包括音频数据加载。",
97
+ "start_time": 28.56,
98
+ "end_time": 31.56,
99
+ "confidence": 0.953521728515625,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null
103
+ },
104
+ {
105
+ "text": "音频数据处理以及",
106
+ "start_time": 31.98,
107
+ "end_time": 33.18,
108
+ "confidence": 0.685791015625,
109
+ "verified": false,
110
+ "verified_text": null,
111
+ "verification_notes": null
112
+ },
113
+ {
114
+ "text": "高效加载大规模音频数据集的流逝加载方法。",
115
+ "start_time": 33.54,
116
+ "end_time": 36.5,
117
+ "confidence": 0.88739013671875,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null
121
+ },
122
+ {
123
+ "text": "完成本单元的学习后",
124
+ "start_time": 37.82,
125
+ "end_time": 38.94,
126
+ "confidence": 0.9327392578125,
127
+ "verified": false,
128
+ "verified_text": null,
129
+ "verification_notes": null
130
+ },
131
+ {
132
+ "text": "你会掌握",
133
+ "start_time": 39.34,
134
+ "end_time": 40.34,
135
+ "confidence": 0.73193359375,
136
+ "verified": false,
137
+ "verified_text": null,
138
+ "verification_notes": null
139
+ },
140
+ {
141
+ "text": "基础的音频相关数",
142
+ "start_time": 40.86,
143
+ "end_time": 42.4,
144
+ "confidence": 0.609619140625,
145
+ "verified": false,
146
+ "verified_text": null,
147
+ "verification_notes": null
148
+ },
149
+ {
150
+ "text": "并且掌握针对不同应用的音频数据处理工具",
151
+ "start_time": 43.04,
152
+ "end_time": 46.56,
153
+ "confidence": 0.96221923828125,
154
+ "verified": false,
155
+ "verified_text": null,
156
+ "verification_notes": null
157
+ },
158
+ {
159
+ "text": "本单元的知识会成为后面章节的基础",
160
+ "start_time": 47.5,
161
+ "end_time": 49.86,
162
+ "confidence": 0.75439453125,
163
+ "verified": false,
164
+ "verified_text": null,
165
+ "verification_notes": null
166
+ }
167
+ ]
168
+ }
dataset/transcripts/test1_segment_1_aggregated_0_to_8_20250424_111323.json ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_1_aggregated_0_to_8.wav",
3
+ "timestamp": "20250424_111323",
4
+ "segments": [
5
+ {
6
+ "text": "第一单元,音频数据处理单元简介",
7
+ "start_time": 0.0,
8
+ "end_time": 2.52,
9
+ "confidence": 0.8226318359375,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null,
13
+ "segment_index": [
14
+ 0,
15
+ 1,
16
+ 2,
17
+ 3,
18
+ 4,
19
+ 5,
20
+ 6,
21
+ 7,
22
+ 8
23
+ ]
24
+ },
25
+ {
26
+ "text": "所有音频或语音相关的任务都需要使用音频文件",
27
+ "start_time": 2.52,
28
+ "end_time": 6.18,
29
+ "confidence": 0.8226318359375,
30
+ "verified": false,
31
+ "verified_text": null,
32
+ "verification_notes": null,
33
+ "segment_index": [
34
+ 0,
35
+ 1,
36
+ 2,
37
+ 3,
38
+ 4,
39
+ 5,
40
+ 6,
41
+ 7,
42
+ 8
43
+ ]
44
+ },
45
+ {
46
+ "text": "在我们深入了解这些任务之前",
47
+ "start_time": 6.18,
48
+ "end_time": 8.02,
49
+ "confidence": 0.8226318359375,
50
+ "verified": false,
51
+ "verified_text": null,
52
+ "verification_notes": null,
53
+ "segment_index": [
54
+ 0,
55
+ 1,
56
+ 2,
57
+ 3,
58
+ 4,
59
+ 5,
60
+ 6,
61
+ 7,
62
+ 8
63
+ ]
64
+ },
65
+ {
66
+ "text": "我们需要了解音频文件的实际内容",
67
+ "start_time": 8.02,
68
+ "end_time": 10.18,
69
+ "confidence": 0.8226318359375,
70
+ "verified": false,
71
+ "verified_text": null,
72
+ "verification_notes": null,
73
+ "segment_index": [
74
+ 0,
75
+ 1,
76
+ 2,
77
+ 3,
78
+ 4,
79
+ 5,
80
+ 6,
81
+ 7,
82
+ 8
83
+ ]
84
+ },
85
+ {
86
+ "text": "以及如何利用音频文件",
87
+ "start_time": 10.18,
88
+ "end_time": 11.52,
89
+ "confidence": 0.8226318359375,
90
+ "verified": false,
91
+ "verified_text": null,
92
+ "verification_notes": null,
93
+ "segment_index": [
94
+ 0,
95
+ 1,
96
+ 2,
97
+ 3,
98
+ 4,
99
+ 5,
100
+ 6,
101
+ 7,
102
+ 8
103
+ ]
104
+ },
105
+ {
106
+ "text": "本单元将为你介绍的",
107
+ "start_time": 11.52,
108
+ "end_time": 12.700000000000001,
109
+ "confidence": 0.8226318359375,
110
+ "verified": false,
111
+ "verified_text": null,
112
+ "verification_notes": null,
113
+ "segment_index": [
114
+ 0,
115
+ 1,
116
+ 2,
117
+ 3,
118
+ 4,
119
+ 5,
120
+ 6,
121
+ 7,
122
+ 8
123
+ ]
124
+ },
125
+ {
126
+ "text": "本单元将为你介绍与音频数据相关的基本概念",
127
+ "start_time": 12.700000000000001,
128
+ "end_time": 16.9,
129
+ "confidence": 0.8226318359375,
130
+ "verified": false,
131
+ "verified_text": null,
132
+ "verification_notes": null,
133
+ "segment_index": [
134
+ 0,
135
+ 1,
136
+ 2,
137
+ 3,
138
+ 4,
139
+ 5,
140
+ 6,
141
+ 7,
142
+ 8
143
+ ]
144
+ },
145
+ {
146
+ "text": "包括波形、采样率和频谱图",
147
+ "start_time": 16.9,
148
+ "end_time": 18.68,
149
+ "confidence": 0.8226318359375,
150
+ "verified": false,
151
+ "verified_text": null,
152
+ "verification_notes": null,
153
+ "segment_index": [
154
+ 0,
155
+ 1,
156
+ 2,
157
+ 3,
158
+ 4,
159
+ 5,
160
+ 6,
161
+ 7,
162
+ 8
163
+ ]
164
+ }
165
+ ]
166
+ }
dataset/transcripts/test1_segment_8_aggregated_7_to_8_20250424_110257.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_8_aggregated_7_to_8.wav",
3
+ "timestamp": "20250424_110257",
4
+ "segments": [
5
+ {
6
+ "text": "本单元将为你介绍与音频数据相关的基本概念",
7
+ "start_time": 0.0,
8
+ "end_time": 3.96,
9
+ "confidence": 0.9649658203125,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null,
13
+ "segment_index": [
14
+ 7,
15
+ 8
16
+ ]
17
+ },
18
+ {
19
+ "text": "包括波形、采样率和频谱图",
20
+ "start_time": 3.96,
21
+ "end_time": 5.76,
22
+ "confidence": 0.9649658203125,
23
+ "verified": false,
24
+ "verified_text": null,
25
+ "verification_notes": null,
26
+ "segment_index": [
27
+ 7,
28
+ 8
29
+ ]
30
+ }
31
+ ]
32
+ }
dataset/transcripts/test1_segment_8_aggregated_7_to_8_20250424_110758.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_8_aggregated_7_to_8.wav",
3
+ "timestamp": "20250424_110758",
4
+ "segments": [
5
+ {
6
+ "text": "本单元将为你介绍与音频数据相关的基本概念",
7
+ "start_time": 0.0,
8
+ "end_time": 3.96,
9
+ "confidence": 0.9649658203125,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null,
13
+ "segment_index": [
14
+ 7,
15
+ 8
16
+ ]
17
+ },
18
+ {
19
+ "text": "包括波形、采样率和频谱图",
20
+ "start_time": 3.96,
21
+ "end_time": 5.76,
22
+ "confidence": 0.9649658203125,
23
+ "verified": false,
24
+ "verified_text": null,
25
+ "verification_notes": null,
26
+ "segment_index": [
27
+ 7,
28
+ 8
29
+ ]
30
+ }
31
+ ]
32
+ }
dataset/transcripts/test1_segment_9_aggregated_8_to_9_20250424_110138.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "dataset/audio/segments\\test1_segment_9_aggregated_8_to_9.wav",
3
+ "timestamp": "20250424_110138",
4
+ "segments": [
5
+ {
6
+ "text": "包括波形、采样率和频谱图。",
7
+ "start_time": 0.0,
8
+ "end_time": 1.68,
9
+ "confidence": 0.812255859375,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null,
13
+ "segment_index": [
14
+ 8,
15
+ 9
16
+ ]
17
+ },
18
+ {
19
+ "text": "你会学习到如何使用音频数据集,包括音频数据加载。",
20
+ "start_time": 1.9000000000000001,
21
+ "end_time": 4.92,
22
+ "confidence": 0.812255859375,
23
+ "verified": false,
24
+ "verified_text": null,
25
+ "verification_notes": null,
26
+ "segment_index": [
27
+ 8,
28
+ 9
29
+ ]
30
+ }
31
+ ]
32
+ }
vad/README.md CHANGED
@@ -14,7 +14,12 @@
14
  - 支持批量处理音频片段
15
  - 保存带时间戳的转录文本
16
 
17
- ### 3. 人工验证界面
 
 
 
 
 
18
  - 交互式验证转录结果
19
  - 支持修改转录文本
20
  - 添加验证注释
@@ -55,8 +60,7 @@ python vad/main.py process dataset/audio/test1.wav
55
  这个命令会:
56
  1. 将音频切割成多个片段
57
  2. 对每个片段进行转录
58
- 3. 提供交互式界面进行验证
59
- 4. 保存结果到JSON文件
60
 
61
  ### 3. 验证已有的转录结果
62
 
@@ -65,6 +69,22 @@ python vad/main.py process dataset/audio/test1.wav
65
  python vad/main.py verify dataset/transcripts/your_transcript.json
66
  ```
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  ## 参数调整
69
 
70
  可以通过修改 `vad/audio_processor.py` 中的参数来优化切割效果:
@@ -141,12 +161,19 @@ python vad/main.py verify dataset/transcripts/meeting_001_20250422_182233.json
141
  - 建议使用16kHz采样率
142
  - 如果是多声道音频会自动转换为单声道
143
 
144
- 2. 性能考虑:
 
 
 
 
 
145
  - 转录速度取决于CPU性能和音频长度
146
  - 较长的音频文件会被自动切割成小片段处理
 
147
 
148
- 3. 后续优化方向:
149
  - 优化切割策略
150
  - 添加批量处理功能
151
  - 改进语义重组算法
152
  - 添加GUI界面
 
 
14
  - 支持批量处理音频片段
15
  - 保存带时间戳的转录文本
16
 
17
+ ### 3. 聚合转录
18
+ - 支持将多个连续片段聚合后进行整体转录
19
+ - 提高语义连贯性和转录准确度
20
+ - 适用于语义上相关的连续语音片段
21
+
22
+ ### 4. 人工验证界面
23
  - 交互式验证转录结果
24
  - 支持修改转录文本
25
  - 添加验证注释
 
60
  这个命令会:
61
  1. 将音频切割成多个片段
62
  2. 对每个片段进行转录
63
+ 3. 保存结果到JSON文件
 
64
 
65
  ### 3. 验证已有的转录结果
66
 
 
69
  python vad/main.py verify dataset/transcripts/your_transcript.json
70
  ```
71
 
72
+ ### 4. 聚合转录连续片段
73
+
74
+ ```bash
75
+ # 在项目根目录下运行
76
+ python vad/main.py aggregate --segments "0,1,2"
77
+ ```
78
+
79
+ 这个命令会:
80
+ 1. 自动查找已处理的音频片段
81
+ 2. 聚合指定的连续片段(这里是索引为0、1、2的片段)
82
+ 3. 对聚合后的音频进行整体转录
83
+ 4. 保存结果到JSON文件
84
+
85
+ 参数说明:
86
+ - `--segments`: 指定要聚合的片段索引,用逗号分隔,索引从0开始(必需)
87
+
88
  ## 参数调整
89
 
90
  可以通过修改 `vad/audio_processor.py` 中的参数来优化切割效果:
 
161
  - 建议使用16kHz采样率
162
  - 如果是多声道音频会自动转换为单声道
163
 
164
+ 2. 聚合转录建议:
165
+ - 建议聚合语义上相关的连续片段
166
+ - 聚合片段不宜过多,建议不超过5个片段
167
+ - 聚合转录适合解决单独转录时出现的语义断裂问题
168
+
169
+ 3. 性能考虑:
170
  - 转录速度取决于CPU性能和音频长度
171
  - 较长的音频文件会被自动切割成小片段处理
172
+ - 聚合转录可能需要更多内存和处理时间
173
 
174
+ 4. 后续优化方向:
175
  - 优化切割策略
176
  - 添加批量处理功能
177
  - 改进语义重组算法
178
  - 添加GUI界面
179
+ - 自动检测适合聚合的片段
vad/audio_transcriber.py CHANGED
@@ -1,10 +1,11 @@
1
  from faster_whisper import WhisperModel
2
  from audio_processor import AudioSegment
3
  import json
4
- from typing import List, Dict, Optional
5
  from dataclasses import dataclass
6
  import os
7
  from datetime import datetime
 
8
 
9
  @dataclass
10
  class TranscriptionResult:
@@ -15,9 +16,10 @@ class TranscriptionResult:
15
  verified: bool = False
16
  verified_text: Optional[str] = None
17
  verification_notes: Optional[str] = None
 
18
 
19
  class AudioTranscriber:
20
- def __init__(self, model: str = "small", device: str = "cuda", compute_type: str = "int8"):
21
  """
22
  初始化转录器
23
 
@@ -37,7 +39,7 @@ class AudioTranscriber:
37
  print("Model transcribe...")
38
  print(f"开始转录音频片段,长度: {len(segment.audio_data)} 采样点 ({len(segment.audio_data)/16000:.2f}秒)")
39
  segments_generator, info = self.model.transcribe(segment.audio_data,
40
- beam_size=3,
41
  language="zh")
42
  print(f"检测到语言: {info.language}, 语言概率: {info.language_probability:.2f}")
43
  segments = list(segments_generator)
@@ -59,6 +61,50 @@ class AudioTranscriber:
59
  results.append(result)
60
 
61
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  def save_transcription(self,
64
  results: List[TranscriptionResult],
@@ -84,7 +130,8 @@ class AudioTranscriber:
84
  "confidence": r.confidence,
85
  "verified": r.verified,
86
  "verified_text": r.verified_text,
87
- "verification_notes": r.verification_notes
 
88
  }
89
  for r in results
90
  ]
@@ -127,7 +174,8 @@ class AudioTranscriber:
127
  confidence=seg["confidence"],
128
  verified=seg["verified"],
129
  verified_text=seg.get("verified_text"),
130
- verification_notes=seg.get("verification_notes")
 
131
  )
132
  results.append(result)
133
 
 
1
  from faster_whisper import WhisperModel
2
  from audio_processor import AudioSegment
3
  import json
4
+ from typing import List, Dict, Optional, Tuple
5
  from dataclasses import dataclass
6
  import os
7
  from datetime import datetime
8
+ import numpy as np
9
 
10
  @dataclass
11
  class TranscriptionResult:
 
16
  verified: bool = False
17
  verified_text: Optional[str] = None
18
  verification_notes: Optional[str] = None
19
+ segment_index: Optional[int] = None # 添加片段索引字段
20
 
21
  class AudioTranscriber:
22
+ def __init__(self, model: str = "large", device: str = "cuda", compute_type: str = "int8"):
23
  """
24
  初始化转录器
25
 
 
39
  print("Model transcribe...")
40
  print(f"开始转录音频片段,长度: {len(segment.audio_data)} 采样点 ({len(segment.audio_data)/16000:.2f}秒)")
41
  segments_generator, info = self.model.transcribe(segment.audio_data,
42
+ beam_size=5,
43
  language="zh")
44
  print(f"检测到语言: {info.language}, 语言概率: {info.language_probability:.2f}")
45
  segments = list(segments_generator)
 
61
  results.append(result)
62
 
63
  return results
64
+
65
def transcribe_aggregated_segments(self, segments: List[AudioSegment]) -> List[TranscriptionResult]:
    """
    Merge several consecutive audio segments into one clip and transcribe it.

    The sample arrays of all segments are laid end to end in a single
    float32 buffer, wrapped in a temporary AudioSegment, and handed to
    ``transcribe_segment``.

    Args:
        segments: Consecutive audio segments to merge, in playback order.

    Returns:
        Whatever ``transcribe_segment`` returns for the merged clip, or an
        empty list when ``segments`` is empty.
    """
    if not segments:
        return []

    print(f"开始聚合转录 {len(segments)} 个连续片段...")

    # The merged clip spans from the first segment's start to the last
    # segment's end.
    first, last = segments[0], segments[-1]

    # Allocate the destination buffer once, sized for every segment.
    lengths = [len(seg.audio_data) for seg in segments]
    merged = np.zeros(sum(lengths), dtype=np.float32)

    # Copy each segment's samples into its slot, back to back.
    offset = 0
    for seg, n_samples in zip(segments, lengths):
        merged[offset:offset + n_samples] = seg.audio_data
        offset += n_samples

    # The duration printout assumes 16 kHz audio (hard-coded divisor).
    print(f"聚合后音频长度: {len(merged)} 采样点 ({len(merged)/16000:.2f}秒)")

    # Temporary segment object that carries the merged audio.
    merged_segment = AudioSegment(
        start_time=first.start_time,
        end_time=last.end_time,
        audio_data=merged,
        is_speech=True
    )

    print("开始转录聚合后的音频...")
    return self.transcribe_segment(merged_segment)
108
 
109
  def save_transcription(self,
110
  results: List[TranscriptionResult],
 
130
  "confidence": r.confidence,
131
  "verified": r.verified,
132
  "verified_text": r.verified_text,
133
+ "verification_notes": r.verification_notes,
134
+ "segment_index": r.segment_index # 添加片段索引到输出
135
  }
136
  for r in results
137
  ]
 
174
  confidence=seg["confidence"],
175
  verified=seg["verified"],
176
  verified_text=seg.get("verified_text"),
177
+ verification_notes=seg.get("verification_notes"),
178
+ segment_index=seg.get("segment_index") # 加载片段索引
179
  )
180
  results.append(result)
181
 
vad/main.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import argparse
3
  from audio_processor import AudioProcessor, AudioSegment
4
  from audio_transcriber import AudioTranscriber, TranscriptionResult
5
- from typing import List, Tuple
6
  import json
7
  import soundfile as sf
8
 
@@ -52,12 +52,90 @@ def transcribe_segments(segment_paths: List[str], original_segments: List[AudioS
52
 
53
  # 转录
54
  results = transcriber.transcribe_segment(segment)
 
 
 
 
 
55
  all_results.extend(results)
56
 
57
  # 保存转录结果
58
  output_path = transcriber.save_transcription(all_results, segment_paths[0])
59
  return output_path
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def verify_transcription(json_path: str):
62
  """
63
  交互式验证转录结果
@@ -99,15 +177,67 @@ def verify_transcription(json_path: str):
99
  transcriber.save_transcription(results, json_path)
100
  print("\n✅ 验证结果已保存")
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  def main():
103
  parser = argparse.ArgumentParser(description="音频处理和转录工具")
104
- parser.add_argument("action", choices=["process", "verify"], help="执行的操作: process(处理音频) 或 verify(验证转录)")
105
- parser.add_argument("input_path", help="输入文件路径 (音频文件或JSON文件)")
 
 
106
 
107
  args = parser.parse_args()
108
 
109
  try:
110
  if args.action == "process":
 
 
 
 
111
  print(f"处理音频文件: {args.input_path}")
112
  # 1. 切割音频
113
  segment_paths, original_segments = process_audio(args.input_path)
@@ -121,8 +251,43 @@ def main():
121
  print(f"✅ 转录完成,结果保存在: {json_path}")
122
 
123
  elif args.action == "verify":
 
 
 
 
124
  verify_transcription(args.input_path)
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  except Exception as e:
127
  print(f"错误: {e}")
128
 
 
2
  import argparse
3
  from audio_processor import AudioProcessor, AudioSegment
4
  from audio_transcriber import AudioTranscriber, TranscriptionResult
5
+ from typing import List, Tuple, Optional
6
  import json
7
  import soundfile as sf
8
 
 
52
 
53
  # 转录
54
  results = transcriber.transcribe_segment(segment)
55
+
56
+ # 设置片段索引
57
+ for result in results:
58
+ result.segment_index = i
59
+
60
  all_results.extend(results)
61
 
62
  # 保存转录结果
63
  output_path = transcriber.save_transcription(all_results, segment_paths[0])
64
  return output_path
65
 
66
def transcribe_aggregated_segments(segment_paths: List[str], original_segments: List[AudioSegment],
                                   segment_indices: List[int]) -> str:
    """
    Transcribe the selected audio segments as one merged clip and save the result.

    Args:
        segment_paths: Paths of all available segment files.
        original_segments: Segment objects parallel to ``segment_paths``;
            only their ``start_time`` / ``end_time`` are used here.
        segment_indices: Positions (into ``segment_paths``) of the segments
            to merge. The list may be in any order and is NOT modified.

    Returns:
        The path returned by ``AudioTranscriber.save_transcription``.

    Raises:
        ValueError: If ``segment_indices`` is empty or contains a position
            outside ``0 .. len(segment_paths) - 1``.
    """
    if not segment_indices:
        raise ValueError("必须指定至少一个片段索引")

    # Reject out-of-range positions before doing any expensive work.
    for idx in segment_indices:
        if idx < 0 or idx >= len(segment_paths):
            raise ValueError(f"无效的片段索引: {idx},有效范围: 0-{len(segment_paths)-1}")

    # Process in ascending order. A sorted copy is used so the caller's list
    # is left untouched (the previous in-place sort mutated the argument).
    ordered_indices = sorted(segment_indices)

    print(f"准备聚合转录片段索引: {ordered_indices}")

    transcriber = AudioTranscriber()
    segments_to_aggregate = []

    # Load the audio for each requested segment.
    for idx in ordered_indices:
        path = segment_paths[idx]
        print(f"加载片段 {idx}/{len(segment_paths)-1}: {path}")

        # Only the samples are needed; the sample rate is discarded.
        audio_data, _ = sf.read(path)

        # Reuse the timestamps recorded on the matching original segment.
        original_segment = original_segments[idx]
        segments_to_aggregate.append(
            AudioSegment(
                start_time=original_segment.start_time,
                end_time=original_segment.end_time,
                audio_data=audio_data,
                is_speech=True
            )
        )

    print(f"开始聚合转录 {len(segments_to_aggregate)} 个片段...")
    results = transcriber.transcribe_aggregated_segments(segments_to_aggregate)

    # Tag every result with the full list of merged positions.
    for result in results:
        result.segment_index = ordered_indices

    # Build a name from the first merged segment's file plus an
    # "aggregated" marker covering the lowest and highest positions.
    base_path = segment_paths[ordered_indices[0]]
    base_name = os.path.splitext(os.path.basename(base_path))[0]
    aggregated_name = f"{base_name}_aggregated_{ordered_indices[0]}_to_{ordered_indices[-1]}"

    # No .wav is written at this path; it is only passed to
    # save_transcription (presumably to derive the JSON name — confirm).
    temp_path = os.path.join(os.path.dirname(base_path), f"{aggregated_name}.wav")

    output_path = transcriber.save_transcription(results, temp_path)
    print(f"✅ 聚合转录结果已保存到: {output_path}")

    return output_path
138
+
139
  def verify_transcription(json_path: str):
140
  """
141
  交互式验证转录结果
 
177
  transcriber.save_transcription(results, json_path)
178
  print("\n✅ 验证结果已保存")
179
 
180
def get_existing_segments(base_dir="dataset/audio/segments"):
    """
    Collect previously saved segment files from ``base_dir``.

    Only ``*.wav`` files whose name ends in ``..._segment_<number>.wav`` are
    considered. The trailing number minus one is used as the sort key.

    Args:
        base_dir: Directory that holds the segment ``.wav`` files.

    Returns:
        A ``(segment_paths, original_segments)`` pair of parallel lists sorted
        by the number in the file name. Both lists are empty when ``base_dir``
        does not exist or holds no matching files. Each AudioSegment has
        ``start_time=0.0`` and ``end_time`` equal to the clip's own duration,
        i.e. timestamps are relative to the clip, not to the source recording.

    NOTE(review): callers index the returned lists by position. If the file
    numbers have gaps, or files from several recordings share the directory,
    positions will not match the numbers in the file names — confirm.
    """
    if not os.path.exists(base_dir):
        return [], []

    # (zero-based index, full path) for every file name that parses.
    indexed_paths = []
    for filename in os.listdir(base_dir):
        if not filename.endswith('.wav'):
            continue

        parts = filename.split('_')
        if len(parts) < 3 or parts[-2] != "segment":
            continue

        try:
            # File names are 1-based; convert to a 0-based index.
            segment_idx = int(parts[-1].split('.')[0]) - 1
        except (ValueError, IndexError):
            print(f"警告: 无法从文件名 {filename} 中提取片段索引")
            continue

        indexed_paths.append((segment_idx, os.path.join(base_dir, filename)))

    # Stable sort on the index only, so ties keep their listing order.
    ordered = sorted(indexed_paths, key=lambda pair: pair[0])
    segment_paths = [path for _, path in ordered]

    # Build placeholder AudioSegment objects with clip-relative timestamps.
    original_segments = []
    for path in segment_paths:
        audio_data, sample_rate = sf.read(path)
        original_segments.append(
            AudioSegment(
                start_time=0.0,
                end_time=len(audio_data) / sample_rate,
                audio_data=audio_data,
                is_speech=True
            )
        )

    return segment_paths, original_segments
225
+
226
  def main():
227
  parser = argparse.ArgumentParser(description="音频处理和转录工具")
228
+ parser.add_argument("action", choices=["process", "verify", "aggregate"],
229
+ help="执行的操作: process(处理音频), verify(验证转录), aggregate(聚合转录)")
230
+ parser.add_argument("input_path", nargs='?', help="输入文件路径 (音频文件或JSON文件)")
231
+ parser.add_argument("--segments", type=str, help="要聚合的片段索引,用逗号分隔,例如 '0,1,2'")
232
 
233
  args = parser.parse_args()
234
 
235
  try:
236
  if args.action == "process":
237
+ if not args.input_path:
238
+ print("❌ 使用 process 操作时必须指定输入文件路径")
239
+ return
240
+
241
  print(f"处理音频文件: {args.input_path}")
242
  # 1. 切割音频
243
  segment_paths, original_segments = process_audio(args.input_path)
 
251
  print(f"✅ 转录完成,结果保存在: {json_path}")
252
 
253
  elif args.action == "verify":
254
+ if not args.input_path:
255
+ print("❌ 使用 verify 操作时必须指定输入文件路径")
256
+ return
257
+
258
  verify_transcription(args.input_path)
259
 
260
+ elif args.action == "aggregate":
261
+ if not args.segments:
262
+ print("❌ 使用 aggregate 操作时必须指定 --segments 参数")
263
+ return
264
+
265
+ # 解析片段索引
266
+ try:
267
+ segment_indices = [int(idx.strip()) for idx in args.segments.split(",")]
268
+ except ValueError:
269
+ print("❌ 片段索引必须是整数,用逗号分隔")
270
+ return
271
+
272
+ # 获取已存在的音频片段
273
+ segment_paths, original_segments = get_existing_segments()
274
+ if not segment_paths:
275
+ print("❌ 未找到已处理的音频片段,请先使用 process 命令处理音频文件")
276
+ return
277
+
278
+ print(f"找到 {len(segment_paths)} 个已处理的音频片段")
279
+
280
+ # 验证索引有效性
281
+ max_idx = len(segment_paths) - 1
282
+ invalid_indices = [idx for idx in segment_indices if idx < 0 or idx > max_idx]
283
+ if invalid_indices:
284
+ print(f"❌ 无效的片段索引: {invalid_indices},有效范围: 0-{max_idx}")
285
+ return
286
+
287
+ # 聚合转录指定的片段
288
+ json_path = transcribe_aggregated_segments(segment_paths, original_segments, segment_indices)
289
+ print(f"✅ 聚合转录完成,结果保存在: {json_path}")
290
+
291
  except Exception as e:
292
  print(f"错误: {e}")
293