Commit
·
25666e3
1
Parent(s):
5b39f9e
aggr voice segment test
Browse files- dataset/transcripts/test1_segment_10_aggregated_9_to_15_20250424_111404.json +132 -0
- dataset/transcripts/test1_segment_10_aggregated_9_to_16_20250424_111418.json +158 -0
- dataset/transcripts/test1_segment_17_aggregated_8_to_9_20250424_105049.json +24 -0
- dataset/transcripts/test1_segment_1_20250423_163518.json +168 -0
- dataset/transcripts/test1_segment_1_20250423_190011.json +168 -0
- dataset/transcripts/test1_segment_1_20250423_190044.json +168 -0
- dataset/transcripts/test1_segment_1_20250423_201934.json +168 -0
- dataset/transcripts/test1_segment_1_aggregated_0_to_8_20250424_111323.json +166 -0
- dataset/transcripts/test1_segment_8_aggregated_7_to_8_20250424_110257.json +32 -0
- dataset/transcripts/test1_segment_8_aggregated_7_to_8_20250424_110758.json +32 -0
- dataset/transcripts/test1_segment_9_aggregated_8_to_9_20250424_110138.json +32 -0
- vad/README.md +32 -5
- vad/audio_transcriber.py +53 -5
- vad/main.py +168 -3
dataset/transcripts/test1_segment_10_aggregated_9_to_15_20250424_111404.json
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_10_aggregated_9_to_15.wav",
|
3 |
+
"timestamp": "20250424_111404",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "你会学习到如何使用音频数据集",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 1.62,
|
9 |
+
"confidence": 0.862060546875,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null,
|
13 |
+
"segment_index": [
|
14 |
+
9,
|
15 |
+
10,
|
16 |
+
11,
|
17 |
+
12,
|
18 |
+
13,
|
19 |
+
14,
|
20 |
+
15
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "包括音频数据加载",
|
25 |
+
"start_time": 1.62,
|
26 |
+
"end_time": 3.0,
|
27 |
+
"confidence": 0.862060546875,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null,
|
31 |
+
"segment_index": [
|
32 |
+
9,
|
33 |
+
10,
|
34 |
+
11,
|
35 |
+
12,
|
36 |
+
13,
|
37 |
+
14,
|
38 |
+
15
|
39 |
+
]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "音频数据预处理",
|
43 |
+
"start_time": 3.0,
|
44 |
+
"end_time": 3.86,
|
45 |
+
"confidence": 0.862060546875,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null,
|
49 |
+
"segment_index": [
|
50 |
+
9,
|
51 |
+
10,
|
52 |
+
11,
|
53 |
+
12,
|
54 |
+
13,
|
55 |
+
14,
|
56 |
+
15
|
57 |
+
]
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "以及高效加载大规模音频数据集的流逝加载方法",
|
61 |
+
"start_time": 3.86,
|
62 |
+
"end_time": 7.2,
|
63 |
+
"confidence": 0.862060546875,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null,
|
67 |
+
"segment_index": [
|
68 |
+
9,
|
69 |
+
10,
|
70 |
+
11,
|
71 |
+
12,
|
72 |
+
13,
|
73 |
+
14,
|
74 |
+
15
|
75 |
+
]
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "完成本单元的学习后",
|
79 |
+
"start_time": 7.2,
|
80 |
+
"end_time": 8.28,
|
81 |
+
"confidence": 0.862060546875,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null,
|
85 |
+
"segment_index": [
|
86 |
+
9,
|
87 |
+
10,
|
88 |
+
11,
|
89 |
+
12,
|
90 |
+
13,
|
91 |
+
14,
|
92 |
+
15
|
93 |
+
]
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "你会掌握基础的音频相关数",
|
97 |
+
"start_time": 8.28,
|
98 |
+
"end_time": 10.88,
|
99 |
+
"confidence": 0.862060546875,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null,
|
103 |
+
"segment_index": [
|
104 |
+
9,
|
105 |
+
10,
|
106 |
+
11,
|
107 |
+
12,
|
108 |
+
13,
|
109 |
+
14,
|
110 |
+
15
|
111 |
+
]
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
115 |
+
"start_time": 10.88,
|
116 |
+
"end_time": 14.42,
|
117 |
+
"confidence": 0.862060546875,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null,
|
121 |
+
"segment_index": [
|
122 |
+
9,
|
123 |
+
10,
|
124 |
+
11,
|
125 |
+
12,
|
126 |
+
13,
|
127 |
+
14,
|
128 |
+
15
|
129 |
+
]
|
130 |
+
}
|
131 |
+
]
|
132 |
+
}
|
dataset/transcripts/test1_segment_10_aggregated_9_to_16_20250424_111418.json
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_10_aggregated_9_to_16.wav",
|
3 |
+
"timestamp": "20250424_111418",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "你会学习到如何使用音频数据集",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 1.62,
|
9 |
+
"confidence": 0.8543701171875,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null,
|
13 |
+
"segment_index": [
|
14 |
+
9,
|
15 |
+
10,
|
16 |
+
11,
|
17 |
+
12,
|
18 |
+
13,
|
19 |
+
14,
|
20 |
+
15,
|
21 |
+
16
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"text": "包括音频数据加载",
|
26 |
+
"start_time": 1.62,
|
27 |
+
"end_time": 3.0,
|
28 |
+
"confidence": 0.8543701171875,
|
29 |
+
"verified": false,
|
30 |
+
"verified_text": null,
|
31 |
+
"verification_notes": null,
|
32 |
+
"segment_index": [
|
33 |
+
9,
|
34 |
+
10,
|
35 |
+
11,
|
36 |
+
12,
|
37 |
+
13,
|
38 |
+
14,
|
39 |
+
15,
|
40 |
+
16
|
41 |
+
]
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"text": "音频数据预处理",
|
45 |
+
"start_time": 3.0,
|
46 |
+
"end_time": 3.86,
|
47 |
+
"confidence": 0.8543701171875,
|
48 |
+
"verified": false,
|
49 |
+
"verified_text": null,
|
50 |
+
"verification_notes": null,
|
51 |
+
"segment_index": [
|
52 |
+
9,
|
53 |
+
10,
|
54 |
+
11,
|
55 |
+
12,
|
56 |
+
13,
|
57 |
+
14,
|
58 |
+
15,
|
59 |
+
16
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"text": "以及高效加载大规模音频数据集的流逝加载方法",
|
64 |
+
"start_time": 3.86,
|
65 |
+
"end_time": 7.2,
|
66 |
+
"confidence": 0.8543701171875,
|
67 |
+
"verified": false,
|
68 |
+
"verified_text": null,
|
69 |
+
"verification_notes": null,
|
70 |
+
"segment_index": [
|
71 |
+
9,
|
72 |
+
10,
|
73 |
+
11,
|
74 |
+
12,
|
75 |
+
13,
|
76 |
+
14,
|
77 |
+
15,
|
78 |
+
16
|
79 |
+
]
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"text": "完成本单元的学习后",
|
83 |
+
"start_time": 7.2,
|
84 |
+
"end_time": 8.28,
|
85 |
+
"confidence": 0.8543701171875,
|
86 |
+
"verified": false,
|
87 |
+
"verified_text": null,
|
88 |
+
"verification_notes": null,
|
89 |
+
"segment_index": [
|
90 |
+
9,
|
91 |
+
10,
|
92 |
+
11,
|
93 |
+
12,
|
94 |
+
13,
|
95 |
+
14,
|
96 |
+
15,
|
97 |
+
16
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"text": "你会掌握基础的音频相关数",
|
102 |
+
"start_time": 8.28,
|
103 |
+
"end_time": 10.88,
|
104 |
+
"confidence": 0.8543701171875,
|
105 |
+
"verified": false,
|
106 |
+
"verified_text": null,
|
107 |
+
"verification_notes": null,
|
108 |
+
"segment_index": [
|
109 |
+
9,
|
110 |
+
10,
|
111 |
+
11,
|
112 |
+
12,
|
113 |
+
13,
|
114 |
+
14,
|
115 |
+
15,
|
116 |
+
16
|
117 |
+
]
|
118 |
+
},
|
119 |
+
{
|
120 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
121 |
+
"start_time": 10.88,
|
122 |
+
"end_time": 14.42,
|
123 |
+
"confidence": 0.8543701171875,
|
124 |
+
"verified": false,
|
125 |
+
"verified_text": null,
|
126 |
+
"verification_notes": null,
|
127 |
+
"segment_index": [
|
128 |
+
9,
|
129 |
+
10,
|
130 |
+
11,
|
131 |
+
12,
|
132 |
+
13,
|
133 |
+
14,
|
134 |
+
15,
|
135 |
+
16
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"text": "本单元的知识会成为后面章节的基础",
|
140 |
+
"start_time": 14.42,
|
141 |
+
"end_time": 16.8,
|
142 |
+
"confidence": 0.8543701171875,
|
143 |
+
"verified": false,
|
144 |
+
"verified_text": null,
|
145 |
+
"verification_notes": null,
|
146 |
+
"segment_index": [
|
147 |
+
9,
|
148 |
+
10,
|
149 |
+
11,
|
150 |
+
12,
|
151 |
+
13,
|
152 |
+
14,
|
153 |
+
15,
|
154 |
+
16
|
155 |
+
]
|
156 |
+
}
|
157 |
+
]
|
158 |
+
}
|
dataset/transcripts/test1_segment_17_aggregated_8_to_9_20250424_105049.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_17_aggregated_8_to_9.wav",
|
3 |
+
"timestamp": "20250424_105049",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "本单元的知识会成为后面章节的基础",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 2.32,
|
9 |
+
"confidence": 0.657958984375,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "音频数据处理",
|
16 |
+
"start_time": 2.32,
|
17 |
+
"end_time": 3.56,
|
18 |
+
"confidence": 0.657958984375,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
}
|
23 |
+
]
|
24 |
+
}
|
dataset/transcripts/test1_segment_1_20250423_163518.json
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_1.wav",
|
3 |
+
"timestamp": "20250423_163518",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "第一單元",
|
7 |
+
"start_time": 3.26,
|
8 |
+
"end_time": 3.9,
|
9 |
+
"confidence": 0.546142578125,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "音频数据处理",
|
16 |
+
"start_time": 4.34,
|
17 |
+
"end_time": 5.74,
|
18 |
+
"confidence": 0.302734375,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "单元简介",
|
25 |
+
"start_time": 7.1,
|
26 |
+
"end_time": 7.859999999999999,
|
27 |
+
"confidence": 0.642578125,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"text": "所有音频或语音相关的任务都需要使用音频文件",
|
34 |
+
"start_time": 8.8,
|
35 |
+
"end_time": 12.4,
|
36 |
+
"confidence": 0.93402099609375,
|
37 |
+
"verified": false,
|
38 |
+
"verified_text": null,
|
39 |
+
"verification_notes": null
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "在我们深入了解这些任务之前,我们需要了解音频文件的实际内容。",
|
43 |
+
"start_time": 12.8,
|
44 |
+
"end_time": 16.8,
|
45 |
+
"confidence": 0.844482421875,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"text": "以及如何利用一瓶文件",
|
52 |
+
"start_time": 17.32,
|
53 |
+
"end_time": 18.72,
|
54 |
+
"confidence": 0.802001953125,
|
55 |
+
"verified": false,
|
56 |
+
"verified_text": null,
|
57 |
+
"verification_notes": null
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "本来原将为你介绍的",
|
61 |
+
"start_time": 19.76,
|
62 |
+
"end_time": 21.040000000000003,
|
63 |
+
"confidence": 0.7650146484375,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"text": "本单元将为你介绍与音频数据相关的基本概念",
|
70 |
+
"start_time": 21.62,
|
71 |
+
"end_time": 25.62,
|
72 |
+
"confidence": 0.87860107421875,
|
73 |
+
"verified": false,
|
74 |
+
"verified_text": null,
|
75 |
+
"verification_notes": null
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "包括波形彩虹绿和冰补涂",
|
79 |
+
"start_time": 26.28,
|
80 |
+
"end_time": 28.080000000000002,
|
81 |
+
"confidence": 0.93768310546875,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"text": "你会学习到如何使用音频数语集",
|
88 |
+
"start_time": 28.56,
|
89 |
+
"end_time": 30.36,
|
90 |
+
"confidence": 0.90057373046875,
|
91 |
+
"verified": false,
|
92 |
+
"verified_text": null,
|
93 |
+
"verification_notes": null
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "包括音频数语加载",
|
97 |
+
"start_time": 30.36,
|
98 |
+
"end_time": 31.599999999999998,
|
99 |
+
"confidence": 0.90057373046875,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"text": "音频数据预处理以及",
|
106 |
+
"start_time": 31.98,
|
107 |
+
"end_time": 33.22,
|
108 |
+
"confidence": 0.597412109375,
|
109 |
+
"verified": false,
|
110 |
+
"verified_text": null,
|
111 |
+
"verification_notes": null
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "高效加载大规模音频数据集的流式加载方法",
|
115 |
+
"start_time": 33.54,
|
116 |
+
"end_time": 36.54,
|
117 |
+
"confidence": 0.76708984375,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"text": "完成本大圆的学习后",
|
124 |
+
"start_time": 37.82,
|
125 |
+
"end_time": 38.94,
|
126 |
+
"confidence": 0.88128662109375,
|
127 |
+
"verified": false,
|
128 |
+
"verified_text": null,
|
129 |
+
"verification_notes": null
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"text": "你会掌握",
|
133 |
+
"start_time": 39.34,
|
134 |
+
"end_time": 40.34,
|
135 |
+
"confidence": 0.375,
|
136 |
+
"verified": false,
|
137 |
+
"verified_text": null,
|
138 |
+
"verification_notes": null
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"text": "基础的音频相关数",
|
142 |
+
"start_time": 40.86,
|
143 |
+
"end_time": 42.38,
|
144 |
+
"confidence": 0.30810546875,
|
145 |
+
"verified": false,
|
146 |
+
"verified_text": null,
|
147 |
+
"verification_notes": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
151 |
+
"start_time": 43.04,
|
152 |
+
"end_time": 46.6,
|
153 |
+
"confidence": 0.9736175537109375,
|
154 |
+
"verified": false,
|
155 |
+
"verified_text": null,
|
156 |
+
"verification_notes": null
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"text": "本大员的支持会成为后面章节的基础",
|
160 |
+
"start_time": 47.5,
|
161 |
+
"end_time": 49.8,
|
162 |
+
"confidence": 0.82470703125,
|
163 |
+
"verified": false,
|
164 |
+
"verified_text": null,
|
165 |
+
"verification_notes": null
|
166 |
+
}
|
167 |
+
]
|
168 |
+
}
|
dataset/transcripts/test1_segment_1_20250423_190011.json
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_1.wav",
|
3 |
+
"timestamp": "20250423_190011",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "第一单元",
|
7 |
+
"start_time": 3.26,
|
8 |
+
"end_time": 3.9,
|
9 |
+
"confidence": 0.85302734375,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "音频数据出来",
|
16 |
+
"start_time": 4.34,
|
17 |
+
"end_time": 5.56,
|
18 |
+
"confidence": 0.4482421875,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "单元简介",
|
25 |
+
"start_time": 7.1,
|
26 |
+
"end_time": 7.8,
|
27 |
+
"confidence": 0.854736328125,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"text": "所有音频或语音相关的任务都需要使用音频文件",
|
34 |
+
"start_time": 8.8,
|
35 |
+
"end_time": 12.4,
|
36 |
+
"confidence": 0.981781005859375,
|
37 |
+
"verified": false,
|
38 |
+
"verified_text": null,
|
39 |
+
"verification_notes": null
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "在我们深入了解这些任务之前",
|
43 |
+
"start_time": 12.8,
|
44 |
+
"end_time": 14.600000000000001,
|
45 |
+
"confidence": 0.8140869140625,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"text": "我们需要了解音频文件的实际内容",
|
52 |
+
"start_time": 14.600000000000001,
|
53 |
+
"end_time": 16.78,
|
54 |
+
"confidence": 0.8140869140625,
|
55 |
+
"verified": false,
|
56 |
+
"verified_text": null,
|
57 |
+
"verification_notes": null
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "以及如何利用音频文件",
|
61 |
+
"start_time": 17.32,
|
62 |
+
"end_time": 18.68,
|
63 |
+
"confidence": 0.793212890625,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"text": "本单元将为你介绍的",
|
70 |
+
"start_time": 19.76,
|
71 |
+
"end_time": 21.12,
|
72 |
+
"confidence": 0.852783203125,
|
73 |
+
"verified": false,
|
74 |
+
"verified_text": null,
|
75 |
+
"verification_notes": null
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "本单元将为你介绍与音频数据相关的基本概念",
|
79 |
+
"start_time": 21.62,
|
80 |
+
"end_time": 25.580000000000002,
|
81 |
+
"confidence": 0.9444580078125,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"text": "包括波形,彩虹率和冰普渡",
|
88 |
+
"start_time": 26.28,
|
89 |
+
"end_time": 28.28,
|
90 |
+
"confidence": 0.732666015625,
|
91 |
+
"verified": false,
|
92 |
+
"verified_text": null,
|
93 |
+
"verification_notes": null
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "你会学习到如何使用音频数据集,包括音频数据加载。",
|
97 |
+
"start_time": 28.56,
|
98 |
+
"end_time": 31.56,
|
99 |
+
"confidence": 0.953521728515625,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"text": "音频数据处理以及",
|
106 |
+
"start_time": 31.98,
|
107 |
+
"end_time": 33.18,
|
108 |
+
"confidence": 0.685791015625,
|
109 |
+
"verified": false,
|
110 |
+
"verified_text": null,
|
111 |
+
"verification_notes": null
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "高效加载大规模音频数据集的流逝加载方法。",
|
115 |
+
"start_time": 33.54,
|
116 |
+
"end_time": 36.5,
|
117 |
+
"confidence": 0.88739013671875,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"text": "完成本单元的学习后",
|
124 |
+
"start_time": 37.82,
|
125 |
+
"end_time": 38.94,
|
126 |
+
"confidence": 0.9327392578125,
|
127 |
+
"verified": false,
|
128 |
+
"verified_text": null,
|
129 |
+
"verification_notes": null
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"text": "你会掌握",
|
133 |
+
"start_time": 39.34,
|
134 |
+
"end_time": 40.34,
|
135 |
+
"confidence": 0.73193359375,
|
136 |
+
"verified": false,
|
137 |
+
"verified_text": null,
|
138 |
+
"verification_notes": null
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"text": "基础的音频相关数",
|
142 |
+
"start_time": 40.86,
|
143 |
+
"end_time": 42.4,
|
144 |
+
"confidence": 0.609619140625,
|
145 |
+
"verified": false,
|
146 |
+
"verified_text": null,
|
147 |
+
"verification_notes": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
151 |
+
"start_time": 43.04,
|
152 |
+
"end_time": 46.56,
|
153 |
+
"confidence": 0.96221923828125,
|
154 |
+
"verified": false,
|
155 |
+
"verified_text": null,
|
156 |
+
"verification_notes": null
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"text": "本单元的知识会成为后面章节的基础",
|
160 |
+
"start_time": 47.5,
|
161 |
+
"end_time": 49.86,
|
162 |
+
"confidence": 0.75439453125,
|
163 |
+
"verified": false,
|
164 |
+
"verified_text": null,
|
165 |
+
"verification_notes": null
|
166 |
+
}
|
167 |
+
]
|
168 |
+
}
|
dataset/transcripts/test1_segment_1_20250423_190044.json
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_1.wav",
|
3 |
+
"timestamp": "20250423_190044",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "第一單元",
|
7 |
+
"start_time": 3.26,
|
8 |
+
"end_time": 3.9,
|
9 |
+
"confidence": 0.546142578125,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "音频数据处理",
|
16 |
+
"start_time": 4.34,
|
17 |
+
"end_time": 5.74,
|
18 |
+
"confidence": 0.302734375,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "单元简介",
|
25 |
+
"start_time": 7.1,
|
26 |
+
"end_time": 7.859999999999999,
|
27 |
+
"confidence": 0.642578125,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"text": "所有音频或语音相关的任务都需要使用音频文件",
|
34 |
+
"start_time": 8.8,
|
35 |
+
"end_time": 12.4,
|
36 |
+
"confidence": 0.93402099609375,
|
37 |
+
"verified": false,
|
38 |
+
"verified_text": null,
|
39 |
+
"verification_notes": null
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "在我们深入了解这些任务之前,我们需要了解音频文件的实际内容。",
|
43 |
+
"start_time": 12.8,
|
44 |
+
"end_time": 16.8,
|
45 |
+
"confidence": 0.844482421875,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"text": "以及如何利用一瓶文件",
|
52 |
+
"start_time": 17.32,
|
53 |
+
"end_time": 18.72,
|
54 |
+
"confidence": 0.802001953125,
|
55 |
+
"verified": false,
|
56 |
+
"verified_text": null,
|
57 |
+
"verification_notes": null
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "本来原将为你介绍的",
|
61 |
+
"start_time": 19.76,
|
62 |
+
"end_time": 21.040000000000003,
|
63 |
+
"confidence": 0.7650146484375,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"text": "本单元将为你介绍与音频数据相关的基本概念",
|
70 |
+
"start_time": 21.62,
|
71 |
+
"end_time": 25.62,
|
72 |
+
"confidence": 0.87860107421875,
|
73 |
+
"verified": false,
|
74 |
+
"verified_text": null,
|
75 |
+
"verification_notes": null
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "包括波形彩虹绿和冰补涂",
|
79 |
+
"start_time": 26.28,
|
80 |
+
"end_time": 28.080000000000002,
|
81 |
+
"confidence": 0.93768310546875,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"text": "你会学习到如何使用音频数语集",
|
88 |
+
"start_time": 28.56,
|
89 |
+
"end_time": 30.36,
|
90 |
+
"confidence": 0.90057373046875,
|
91 |
+
"verified": false,
|
92 |
+
"verified_text": null,
|
93 |
+
"verification_notes": null
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "包括音频数语加载",
|
97 |
+
"start_time": 30.36,
|
98 |
+
"end_time": 31.599999999999998,
|
99 |
+
"confidence": 0.90057373046875,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"text": "音频数据预处理以及",
|
106 |
+
"start_time": 31.98,
|
107 |
+
"end_time": 33.22,
|
108 |
+
"confidence": 0.597412109375,
|
109 |
+
"verified": false,
|
110 |
+
"verified_text": null,
|
111 |
+
"verification_notes": null
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "高效加载大规模音频数据集的流式加载方法",
|
115 |
+
"start_time": 33.54,
|
116 |
+
"end_time": 36.54,
|
117 |
+
"confidence": 0.76708984375,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"text": "完成本大圆的学习后",
|
124 |
+
"start_time": 37.82,
|
125 |
+
"end_time": 38.94,
|
126 |
+
"confidence": 0.88128662109375,
|
127 |
+
"verified": false,
|
128 |
+
"verified_text": null,
|
129 |
+
"verification_notes": null
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"text": "你会掌握",
|
133 |
+
"start_time": 39.34,
|
134 |
+
"end_time": 40.34,
|
135 |
+
"confidence": 0.375,
|
136 |
+
"verified": false,
|
137 |
+
"verified_text": null,
|
138 |
+
"verification_notes": null
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"text": "基础的音频相关数",
|
142 |
+
"start_time": 40.86,
|
143 |
+
"end_time": 42.38,
|
144 |
+
"confidence": 0.30810546875,
|
145 |
+
"verified": false,
|
146 |
+
"verified_text": null,
|
147 |
+
"verification_notes": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
151 |
+
"start_time": 43.04,
|
152 |
+
"end_time": 46.6,
|
153 |
+
"confidence": 0.9736175537109375,
|
154 |
+
"verified": false,
|
155 |
+
"verified_text": null,
|
156 |
+
"verification_notes": null
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"text": "本大员的支持会成为后面章节的基础",
|
160 |
+
"start_time": 47.5,
|
161 |
+
"end_time": 49.8,
|
162 |
+
"confidence": 0.82470703125,
|
163 |
+
"verified": false,
|
164 |
+
"verified_text": null,
|
165 |
+
"verification_notes": null
|
166 |
+
}
|
167 |
+
]
|
168 |
+
}
|
dataset/transcripts/test1_segment_1_20250423_201934.json
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_1.wav",
|
3 |
+
"timestamp": "20250423_201934",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "第一单元",
|
7 |
+
"start_time": 3.26,
|
8 |
+
"end_time": 3.9,
|
9 |
+
"confidence": 0.85302734375,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "音频数据出来",
|
16 |
+
"start_time": 4.34,
|
17 |
+
"end_time": 5.56,
|
18 |
+
"confidence": 0.4482421875,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "单元简介",
|
25 |
+
"start_time": 7.1,
|
26 |
+
"end_time": 7.8,
|
27 |
+
"confidence": 0.854736328125,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"text": "所有音频或语音相关的任务都需要使用音频文件",
|
34 |
+
"start_time": 8.8,
|
35 |
+
"end_time": 12.4,
|
36 |
+
"confidence": 0.981781005859375,
|
37 |
+
"verified": false,
|
38 |
+
"verified_text": null,
|
39 |
+
"verification_notes": null
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "在我们深入了解这些任务之前",
|
43 |
+
"start_time": 12.8,
|
44 |
+
"end_time": 14.600000000000001,
|
45 |
+
"confidence": 0.8140869140625,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"text": "我们需要了解音频文件的实际内容",
|
52 |
+
"start_time": 14.600000000000001,
|
53 |
+
"end_time": 16.78,
|
54 |
+
"confidence": 0.8140869140625,
|
55 |
+
"verified": false,
|
56 |
+
"verified_text": null,
|
57 |
+
"verification_notes": null
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "以及如何利用音频文件",
|
61 |
+
"start_time": 17.32,
|
62 |
+
"end_time": 18.68,
|
63 |
+
"confidence": 0.793212890625,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"text": "本单元将为你介绍的",
|
70 |
+
"start_time": 19.76,
|
71 |
+
"end_time": 21.12,
|
72 |
+
"confidence": 0.852783203125,
|
73 |
+
"verified": false,
|
74 |
+
"verified_text": null,
|
75 |
+
"verification_notes": null
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "本单元将为你介绍与音频数据相关的基本概念",
|
79 |
+
"start_time": 21.62,
|
80 |
+
"end_time": 25.580000000000002,
|
81 |
+
"confidence": 0.9444580078125,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"text": "包括波形,彩虹率和冰普渡",
|
88 |
+
"start_time": 26.28,
|
89 |
+
"end_time": 28.28,
|
90 |
+
"confidence": 0.732666015625,
|
91 |
+
"verified": false,
|
92 |
+
"verified_text": null,
|
93 |
+
"verification_notes": null
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "你会学习到如何使用音频数据集,包括音频数据加载。",
|
97 |
+
"start_time": 28.56,
|
98 |
+
"end_time": 31.56,
|
99 |
+
"confidence": 0.953521728515625,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"text": "音频数据处理以及",
|
106 |
+
"start_time": 31.98,
|
107 |
+
"end_time": 33.18,
|
108 |
+
"confidence": 0.685791015625,
|
109 |
+
"verified": false,
|
110 |
+
"verified_text": null,
|
111 |
+
"verification_notes": null
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "高效加载大规模音频数据集的流逝加载方法。",
|
115 |
+
"start_time": 33.54,
|
116 |
+
"end_time": 36.5,
|
117 |
+
"confidence": 0.88739013671875,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"text": "完成本单元的学习后",
|
124 |
+
"start_time": 37.82,
|
125 |
+
"end_time": 38.94,
|
126 |
+
"confidence": 0.9327392578125,
|
127 |
+
"verified": false,
|
128 |
+
"verified_text": null,
|
129 |
+
"verification_notes": null
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"text": "你会掌握",
|
133 |
+
"start_time": 39.34,
|
134 |
+
"end_time": 40.34,
|
135 |
+
"confidence": 0.73193359375,
|
136 |
+
"verified": false,
|
137 |
+
"verified_text": null,
|
138 |
+
"verification_notes": null
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"text": "基础的音频相关数",
|
142 |
+
"start_time": 40.86,
|
143 |
+
"end_time": 42.4,
|
144 |
+
"confidence": 0.609619140625,
|
145 |
+
"verified": false,
|
146 |
+
"verified_text": null,
|
147 |
+
"verification_notes": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
151 |
+
"start_time": 43.04,
|
152 |
+
"end_time": 46.56,
|
153 |
+
"confidence": 0.96221923828125,
|
154 |
+
"verified": false,
|
155 |
+
"verified_text": null,
|
156 |
+
"verification_notes": null
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"text": "本单元的知识会成为后面章节的基础",
|
160 |
+
"start_time": 47.5,
|
161 |
+
"end_time": 49.86,
|
162 |
+
"confidence": 0.75439453125,
|
163 |
+
"verified": false,
|
164 |
+
"verified_text": null,
|
165 |
+
"verification_notes": null
|
166 |
+
}
|
167 |
+
]
|
168 |
+
}
|
dataset/transcripts/test1_segment_1_aggregated_0_to_8_20250424_111323.json
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_1_aggregated_0_to_8.wav",
|
3 |
+
"timestamp": "20250424_111323",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "第一单元,音频数据处理单元简介",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 2.52,
|
9 |
+
"confidence": 0.8226318359375,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null,
|
13 |
+
"segment_index": [
|
14 |
+
0,
|
15 |
+
1,
|
16 |
+
2,
|
17 |
+
3,
|
18 |
+
4,
|
19 |
+
5,
|
20 |
+
6,
|
21 |
+
7,
|
22 |
+
8
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"text": "所有音频或语音相关的任务都需要使用音频文件",
|
27 |
+
"start_time": 2.52,
|
28 |
+
"end_time": 6.18,
|
29 |
+
"confidence": 0.8226318359375,
|
30 |
+
"verified": false,
|
31 |
+
"verified_text": null,
|
32 |
+
"verification_notes": null,
|
33 |
+
"segment_index": [
|
34 |
+
0,
|
35 |
+
1,
|
36 |
+
2,
|
37 |
+
3,
|
38 |
+
4,
|
39 |
+
5,
|
40 |
+
6,
|
41 |
+
7,
|
42 |
+
8
|
43 |
+
]
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"text": "在我们深入了解这些任务之前",
|
47 |
+
"start_time": 6.18,
|
48 |
+
"end_time": 8.02,
|
49 |
+
"confidence": 0.8226318359375,
|
50 |
+
"verified": false,
|
51 |
+
"verified_text": null,
|
52 |
+
"verification_notes": null,
|
53 |
+
"segment_index": [
|
54 |
+
0,
|
55 |
+
1,
|
56 |
+
2,
|
57 |
+
3,
|
58 |
+
4,
|
59 |
+
5,
|
60 |
+
6,
|
61 |
+
7,
|
62 |
+
8
|
63 |
+
]
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"text": "我们需要了解音频文件的实际内容",
|
67 |
+
"start_time": 8.02,
|
68 |
+
"end_time": 10.18,
|
69 |
+
"confidence": 0.8226318359375,
|
70 |
+
"verified": false,
|
71 |
+
"verified_text": null,
|
72 |
+
"verification_notes": null,
|
73 |
+
"segment_index": [
|
74 |
+
0,
|
75 |
+
1,
|
76 |
+
2,
|
77 |
+
3,
|
78 |
+
4,
|
79 |
+
5,
|
80 |
+
6,
|
81 |
+
7,
|
82 |
+
8
|
83 |
+
]
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"text": "以及如何利用音频文件",
|
87 |
+
"start_time": 10.18,
|
88 |
+
"end_time": 11.52,
|
89 |
+
"confidence": 0.8226318359375,
|
90 |
+
"verified": false,
|
91 |
+
"verified_text": null,
|
92 |
+
"verification_notes": null,
|
93 |
+
"segment_index": [
|
94 |
+
0,
|
95 |
+
1,
|
96 |
+
2,
|
97 |
+
3,
|
98 |
+
4,
|
99 |
+
5,
|
100 |
+
6,
|
101 |
+
7,
|
102 |
+
8
|
103 |
+
]
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"text": "本单元将为你介绍的",
|
107 |
+
"start_time": 11.52,
|
108 |
+
"end_time": 12.700000000000001,
|
109 |
+
"confidence": 0.8226318359375,
|
110 |
+
"verified": false,
|
111 |
+
"verified_text": null,
|
112 |
+
"verification_notes": null,
|
113 |
+
"segment_index": [
|
114 |
+
0,
|
115 |
+
1,
|
116 |
+
2,
|
117 |
+
3,
|
118 |
+
4,
|
119 |
+
5,
|
120 |
+
6,
|
121 |
+
7,
|
122 |
+
8
|
123 |
+
]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"text": "本单元将为你介绍与音频数据相关的基本概念",
|
127 |
+
"start_time": 12.700000000000001,
|
128 |
+
"end_time": 16.9,
|
129 |
+
"confidence": 0.8226318359375,
|
130 |
+
"verified": false,
|
131 |
+
"verified_text": null,
|
132 |
+
"verification_notes": null,
|
133 |
+
"segment_index": [
|
134 |
+
0,
|
135 |
+
1,
|
136 |
+
2,
|
137 |
+
3,
|
138 |
+
4,
|
139 |
+
5,
|
140 |
+
6,
|
141 |
+
7,
|
142 |
+
8
|
143 |
+
]
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"text": "包括波形、采样率和频谱图",
|
147 |
+
"start_time": 16.9,
|
148 |
+
"end_time": 18.68,
|
149 |
+
"confidence": 0.8226318359375,
|
150 |
+
"verified": false,
|
151 |
+
"verified_text": null,
|
152 |
+
"verification_notes": null,
|
153 |
+
"segment_index": [
|
154 |
+
0,
|
155 |
+
1,
|
156 |
+
2,
|
157 |
+
3,
|
158 |
+
4,
|
159 |
+
5,
|
160 |
+
6,
|
161 |
+
7,
|
162 |
+
8
|
163 |
+
]
|
164 |
+
}
|
165 |
+
]
|
166 |
+
}
|
dataset/transcripts/test1_segment_8_aggregated_7_to_8_20250424_110257.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_8_aggregated_7_to_8.wav",
|
3 |
+
"timestamp": "20250424_110257",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "本单元将为你介绍与音频数据相关的基本概念",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 3.96,
|
9 |
+
"confidence": 0.9649658203125,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null,
|
13 |
+
"segment_index": [
|
14 |
+
7,
|
15 |
+
8
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"text": "包括波形、采样率和频谱图",
|
20 |
+
"start_time": 3.96,
|
21 |
+
"end_time": 5.76,
|
22 |
+
"confidence": 0.9649658203125,
|
23 |
+
"verified": false,
|
24 |
+
"verified_text": null,
|
25 |
+
"verification_notes": null,
|
26 |
+
"segment_index": [
|
27 |
+
7,
|
28 |
+
8
|
29 |
+
]
|
30 |
+
}
|
31 |
+
]
|
32 |
+
}
|
dataset/transcripts/test1_segment_8_aggregated_7_to_8_20250424_110758.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_8_aggregated_7_to_8.wav",
|
3 |
+
"timestamp": "20250424_110758",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "本单元将为你介绍与音频数据相关的基本概念",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 3.96,
|
9 |
+
"confidence": 0.9649658203125,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null,
|
13 |
+
"segment_index": [
|
14 |
+
7,
|
15 |
+
8
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"text": "包括波形、采样率和频谱图",
|
20 |
+
"start_time": 3.96,
|
21 |
+
"end_time": 5.76,
|
22 |
+
"confidence": 0.9649658203125,
|
23 |
+
"verified": false,
|
24 |
+
"verified_text": null,
|
25 |
+
"verification_notes": null,
|
26 |
+
"segment_index": [
|
27 |
+
7,
|
28 |
+
8
|
29 |
+
]
|
30 |
+
}
|
31 |
+
]
|
32 |
+
}
|
dataset/transcripts/test1_segment_9_aggregated_8_to_9_20250424_110138.json
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "dataset/audio/segments\\test1_segment_9_aggregated_8_to_9.wav",
|
3 |
+
"timestamp": "20250424_110138",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "包括波形、采样率和频谱图。",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 1.68,
|
9 |
+
"confidence": 0.812255859375,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null,
|
13 |
+
"segment_index": [
|
14 |
+
8,
|
15 |
+
9
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"text": "你会学习到如何使用音频数据集,包括音频数据加载。",
|
20 |
+
"start_time": 1.9000000000000001,
|
21 |
+
"end_time": 4.92,
|
22 |
+
"confidence": 0.812255859375,
|
23 |
+
"verified": false,
|
24 |
+
"verified_text": null,
|
25 |
+
"verification_notes": null,
|
26 |
+
"segment_index": [
|
27 |
+
8,
|
28 |
+
9
|
29 |
+
]
|
30 |
+
}
|
31 |
+
]
|
32 |
+
}
|
vad/README.md
CHANGED
@@ -14,7 +14,12 @@
|
|
14 |
- 支持批量处理音频片段
|
15 |
- 保存带时间戳的转录文本
|
16 |
|
17 |
-
### 3.
|
|
|
|
|
|
|
|
|
|
|
18 |
- 交互式验证转录结果
|
19 |
- 支持修改转录文本
|
20 |
- 添加验证注释
|
@@ -55,8 +60,7 @@ python vad/main.py process dataset/audio/test1.wav
|
|
55 |
这个命令会:
|
56 |
1. 将音频切割成多个片段
|
57 |
2. 对每个片段进行转录
|
58 |
-
3.
|
59 |
-
4. 保存结果到JSON文件
|
60 |
|
61 |
### 3. 验证已有的转录结果
|
62 |
|
@@ -65,6 +69,22 @@ python vad/main.py process dataset/audio/test1.wav
|
|
65 |
python vad/main.py verify dataset/transcripts/your_transcript.json
|
66 |
```
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
## 参数调整
|
69 |
|
70 |
可以通过修改 `vad/audio_processor.py` 中的参数来优化切割效果:
|
@@ -141,12 +161,19 @@ python vad/main.py verify dataset/transcripts/meeting_001_20250422_182233.json
|
|
141 |
- 建议使用16kHz采样率
|
142 |
- 如果是多声道音频会自动转换为单声道
|
143 |
|
144 |
-
2.
|
|
|
|
|
|
|
|
|
|
|
145 |
- 转录速度取决于CPU性能和音频长度
|
146 |
- 较长的音频文件会被自动切割成小片段处理
|
|
|
147 |
|
148 |
-
|
149 |
- 优化切割策略
|
150 |
- 添加批量处理功能
|
151 |
- 改进语义重组算法
|
152 |
- 添加GUI界面
|
|
|
|
14 |
- 支持批量处理音频片段
|
15 |
- 保存带时间戳的转录文本
|
16 |
|
17 |
+
### 3. 聚合转录
|
18 |
+
- 支持将多个连续片段聚合后进行整体转录
|
19 |
+
- 提高语义连贯性和转录准确度
|
20 |
+
- 适用于语义上相关的连续语音片段
|
21 |
+
|
22 |
+
### 4. 人工验证界面
|
23 |
- 交互式验证转录结果
|
24 |
- 支持修改转录文本
|
25 |
- 添加验证注释
|
|
|
60 |
这个命令会:
|
61 |
1. 将音频切割成多个片段
|
62 |
2. 对每个片段进行转录
|
63 |
+
3. 保存结果到JSON文件
|
|
|
64 |
|
65 |
### 3. 验证已有的转录结果
|
66 |
|
|
|
69 |
python vad/main.py verify dataset/transcripts/your_transcript.json
|
70 |
```
|
71 |
|
72 |
+
### 4. 聚合转录连续片段
|
73 |
+
|
74 |
+
```bash
|
75 |
+
# 在项目根目录下运行
|
76 |
+
python vad/main.py aggregate --segments "0,1,2"
|
77 |
+
```
|
78 |
+
|
79 |
+
这个命令会:
|
80 |
+
1. 自动查找已处理的音频片段
|
81 |
+
2. 聚合指定的连续片段(这里是索引为0、1、2的片段)
|
82 |
+
3. 对聚合后的音频进行整体转录
|
83 |
+
4. 保存结果到JSON文件
|
84 |
+
|
85 |
+
参数说明:
|
86 |
+
- `--segments`: 指定要聚合的片段索引,用逗号分隔,索引从0开始(必需)
|
87 |
+
|
88 |
## 参数调整
|
89 |
|
90 |
可以通过修改 `vad/audio_processor.py` 中的参数来优化切割效果:
|
|
|
161 |
- 建议使用16kHz采样率
|
162 |
- 如果是多声道音频会自动转换为单声道
|
163 |
|
164 |
+
2. 聚合转录建议:
|
165 |
+
- 建议聚合语义上相关的连续片段
|
166 |
+
- 聚合片段不宜过多,建议不超过5个片段
|
167 |
+
- 聚合转录适合解决单独转录时出现的语义断裂问题
|
168 |
+
|
169 |
+
3. 性能考虑:
|
170 |
- 转录速度取决于CPU性能和音频长度
|
171 |
- 较长的音频文件会被自动切割成小片段处理
|
172 |
+
- 聚合转录可能需要更多内存和处理时间
|
173 |
|
174 |
+
4. 后续优化方向:
|
175 |
- 优化切割策略
|
176 |
- 添加批量处理功能
|
177 |
- 改进语义重组算法
|
178 |
- 添加GUI界面
|
179 |
+
- 自动检测适合聚合的片段
|
vad/audio_transcriber.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
from faster_whisper import WhisperModel
|
2 |
from audio_processor import AudioSegment
|
3 |
import json
|
4 |
-
from typing import List, Dict, Optional
|
5 |
from dataclasses import dataclass
|
6 |
import os
|
7 |
from datetime import datetime
|
|
|
8 |
|
9 |
@dataclass
|
10 |
class TranscriptionResult:
|
@@ -15,9 +16,10 @@ class TranscriptionResult:
|
|
15 |
verified: bool = False
|
16 |
verified_text: Optional[str] = None
|
17 |
verification_notes: Optional[str] = None
|
|
|
18 |
|
19 |
class AudioTranscriber:
|
20 |
-
def __init__(self, model: str = "
|
21 |
"""
|
22 |
初始化转录器
|
23 |
|
@@ -37,7 +39,7 @@ class AudioTranscriber:
|
|
37 |
print("Model transcribe...")
|
38 |
print(f"开始转录音频片段,长度: {len(segment.audio_data)} 采样点 ({len(segment.audio_data)/16000:.2f}秒)")
|
39 |
segments_generator, info = self.model.transcribe(segment.audio_data,
|
40 |
-
beam_size=
|
41 |
language="zh")
|
42 |
print(f"检测到语言: {info.language}, 语言概率: {info.language_probability:.2f}")
|
43 |
segments = list(segments_generator)
|
@@ -59,6 +61,50 @@ class AudioTranscriber:
|
|
59 |
results.append(result)
|
60 |
|
61 |
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
def save_transcription(self,
|
64 |
results: List[TranscriptionResult],
|
@@ -84,7 +130,8 @@ class AudioTranscriber:
|
|
84 |
"confidence": r.confidence,
|
85 |
"verified": r.verified,
|
86 |
"verified_text": r.verified_text,
|
87 |
-
"verification_notes": r.verification_notes
|
|
|
88 |
}
|
89 |
for r in results
|
90 |
]
|
@@ -127,7 +174,8 @@ class AudioTranscriber:
|
|
127 |
confidence=seg["confidence"],
|
128 |
verified=seg["verified"],
|
129 |
verified_text=seg.get("verified_text"),
|
130 |
-
verification_notes=seg.get("verification_notes")
|
|
|
131 |
)
|
132 |
results.append(result)
|
133 |
|
|
|
1 |
from faster_whisper import WhisperModel
|
2 |
from audio_processor import AudioSegment
|
3 |
import json
|
4 |
+
from typing import List, Dict, Optional, Tuple
|
5 |
from dataclasses import dataclass
|
6 |
import os
|
7 |
from datetime import datetime
|
8 |
+
import numpy as np
|
9 |
|
10 |
@dataclass
|
11 |
class TranscriptionResult:
|
|
|
16 |
verified: bool = False
|
17 |
verified_text: Optional[str] = None
|
18 |
verification_notes: Optional[str] = None
|
19 |
+
segment_index: Optional[int] = None # 添加片段索引字段
|
20 |
|
21 |
class AudioTranscriber:
|
22 |
+
def __init__(self, model: str = "large", device: str = "cuda", compute_type: str = "int8"):
|
23 |
"""
|
24 |
初始化转录器
|
25 |
|
|
|
39 |
print("Model transcribe...")
|
40 |
print(f"开始转录音频片段,长度: {len(segment.audio_data)} 采样点 ({len(segment.audio_data)/16000:.2f}秒)")
|
41 |
segments_generator, info = self.model.transcribe(segment.audio_data,
|
42 |
+
beam_size=5,
|
43 |
language="zh")
|
44 |
print(f"检测到语言: {info.language}, 语言概率: {info.language_probability:.2f}")
|
45 |
segments = list(segments_generator)
|
|
|
61 |
results.append(result)
|
62 |
|
63 |
return results
|
64 |
+
|
65 |
+
def transcribe_aggregated_segments(self, segments: List[AudioSegment]) -> List[TranscriptionResult]:
    """
    Transcribe several consecutive audio segments as one continuous clip.

    The samples of all segments are joined end to end, in the order given,
    and the merged audio is passed to ``transcribe_segment`` in a single
    call, so the model sees the neighbouring speech as context.

    Args:
        segments: Consecutive audio segments to merge, in playback order.

    Returns:
        Whatever ``transcribe_segment`` returns for the merged audio; an
        empty list when ``segments`` is empty.

    Raises:
        ValueError: If a segment's audio data is not one-dimensional.
    """
    if not segments:
        return []

    print(f"开始聚合转录 {len(segments)} 个连续片段...")

    # Convert every segment to float32 up front so the merged buffer has a
    # single dtype regardless of how each segment was loaded.
    chunks = [np.asarray(segment.audio_data, dtype=np.float32) for segment in segments]

    # Fail early with a clear message instead of an opaque shape error:
    # the merged buffer is a flat sample array.
    if any(chunk.ndim != 1 for chunk in chunks):
        raise ValueError("聚合转录仅支持单声道(一维)音频数据")

    aggregated_audio = np.concatenate(chunks)

    # NOTE: the duration shown here is computed with a fixed 16000 samples/s.
    print(f"聚合后音频长度: {len(aggregated_audio)} 采样点 ({len(aggregated_audio)/16000:.2f}秒)")

    # Temporary segment spanning from the first segment's start to the last
    # segment's end.
    aggregated_segment = AudioSegment(
        start_time=segments[0].start_time,
        end_time=segments[-1].end_time,
        audio_data=aggregated_audio,
        is_speech=True
    )

    print("开始转录聚合后的音频...")
    return self.transcribe_segment(aggregated_segment)
|
108 |
|
109 |
def save_transcription(self,
|
110 |
results: List[TranscriptionResult],
|
|
|
130 |
"confidence": r.confidence,
|
131 |
"verified": r.verified,
|
132 |
"verified_text": r.verified_text,
|
133 |
+
"verification_notes": r.verification_notes,
|
134 |
+
"segment_index": r.segment_index # 添加片段索引到输出
|
135 |
}
|
136 |
for r in results
|
137 |
]
|
|
|
174 |
confidence=seg["confidence"],
|
175 |
verified=seg["verified"],
|
176 |
verified_text=seg.get("verified_text"),
|
177 |
+
verification_notes=seg.get("verification_notes"),
|
178 |
+
segment_index=seg.get("segment_index") # 加载片段索引
|
179 |
)
|
180 |
results.append(result)
|
181 |
|
vad/main.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import argparse
|
3 |
from audio_processor import AudioProcessor, AudioSegment
|
4 |
from audio_transcriber import AudioTranscriber, TranscriptionResult
|
5 |
-
from typing import List, Tuple
|
6 |
import json
|
7 |
import soundfile as sf
|
8 |
|
@@ -52,12 +52,90 @@ def transcribe_segments(segment_paths: List[str], original_segments: List[AudioS
|
|
52 |
|
53 |
# 转录
|
54 |
results = transcriber.transcribe_segment(segment)
|
|
|
|
|
|
|
|
|
|
|
55 |
all_results.extend(results)
|
56 |
|
57 |
# 保存转录结果
|
58 |
output_path = transcriber.save_transcription(all_results, segment_paths[0])
|
59 |
return output_path
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def verify_transcription(json_path: str):
|
62 |
"""
|
63 |
交互式验证转录结果
|
@@ -99,15 +177,67 @@ def verify_transcription(json_path: str):
|
|
99 |
transcriber.save_transcription(results, json_path)
|
100 |
print("\n✅ 验证结果已保存")
|
101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
def main():
|
103 |
parser = argparse.ArgumentParser(description="音频处理和转录工具")
|
104 |
-
parser.add_argument("action", choices=["process", "verify"
|
105 |
-
|
|
|
|
|
106 |
|
107 |
args = parser.parse_args()
|
108 |
|
109 |
try:
|
110 |
if args.action == "process":
|
|
|
|
|
|
|
|
|
111 |
print(f"处理音频文件: {args.input_path}")
|
112 |
# 1. 切割音频
|
113 |
segment_paths, original_segments = process_audio(args.input_path)
|
@@ -121,8 +251,43 @@ def main():
|
|
121 |
print(f"✅ 转录完成,结果保存在: {json_path}")
|
122 |
|
123 |
elif args.action == "verify":
|
|
|
|
|
|
|
|
|
124 |
verify_transcription(args.input_path)
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
except Exception as e:
|
127 |
print(f"错误: {e}")
|
128 |
|
|
|
2 |
import argparse
|
3 |
from audio_processor import AudioProcessor, AudioSegment
|
4 |
from audio_transcriber import AudioTranscriber, TranscriptionResult
|
5 |
+
from typing import List, Tuple, Optional
|
6 |
import json
|
7 |
import soundfile as sf
|
8 |
|
|
|
52 |
|
53 |
# 转录
|
54 |
results = transcriber.transcribe_segment(segment)
|
55 |
+
|
56 |
+
# 设置片段索引
|
57 |
+
for result in results:
|
58 |
+
result.segment_index = i
|
59 |
+
|
60 |
all_results.extend(results)
|
61 |
|
62 |
# 保存转录结果
|
63 |
output_path = transcriber.save_transcription(all_results, segment_paths[0])
|
64 |
return output_path
|
65 |
|
66 |
+
def transcribe_aggregated_segments(segment_paths: List[str], original_segments: List[AudioSegment],
                                   segment_indices: List[int]) -> str:
    """
    Transcribe the selected audio segments as one clip and save the result.

    The selected segment files are loaded in ascending index order, merged
    and transcribed by ``AudioTranscriber.transcribe_aggregated_segments``,
    and the results are written with ``AudioTranscriber.save_transcription``.

    Args:
        segment_paths: Paths of all audio segment files, indexed by segment.
        original_segments: Segment objects parallel to ``segment_paths``;
            only their ``start_time`` / ``end_time`` are used here.
        segment_indices: 0-based indices of the segments to aggregate.
            The list is not modified; duplicates are ignored.

    Returns:
        Path of the saved transcription file.

    Raises:
        ValueError: If ``segment_indices`` is empty or contains an index
            outside the range covered by both input lists.
    """
    if not segment_indices:
        raise ValueError("必须指定至少一个片段索引")

    # An index must be usable for BOTH parallel lists, otherwise the
    # timestamp lookup below would fail with an IndexError.
    limit = min(len(segment_paths), len(original_segments))
    for idx in segment_indices:
        if idx < 0 or idx >= limit:
            raise ValueError(f"无效的片段索引: {idx},有效范围: 0-{limit-1}")

    # Work on a sorted, de-duplicated copy: the caller's list stays
    # untouched and no segment's audio is merged twice.
    ordered_indices = sorted(set(segment_indices))

    print(f"准备聚合转录片段索引: {ordered_indices}")

    transcriber = AudioTranscriber()
    segments_to_aggregate = []

    # Load the audio of each selected segment from disk.
    for idx in ordered_indices:
        path = segment_paths[idx]
        print(f"加载片段 {idx}/{len(segment_paths)-1}: {path}")

        audio_data, _ = sf.read(path)

        # Keep the timestamps recorded on the original segment object.
        original_segment = original_segments[idx]
        segments_to_aggregate.append(
            AudioSegment(
                start_time=original_segment.start_time,
                end_time=original_segment.end_time,
                audio_data=audio_data,
                is_speech=True
            )
        )

    print(f"开始聚合转录 {len(segments_to_aggregate)} 个片段...")
    results = transcriber.transcribe_aggregated_segments(segments_to_aggregate)

    # Tag every result with the aggregated indices; each result gets its
    # own copy so editing one result cannot affect the others.
    for result in results:
        result.segment_index = list(ordered_indices)

    # Derive the output name from the first segment's file, marked as
    # aggregated together with the covered index range.
    base_path = segment_paths[ordered_indices[0]]
    base_name = os.path.splitext(os.path.basename(base_path))[0]
    aggregated_name = f"{base_name}_aggregated_{ordered_indices[0]}_to_{ordered_indices[-1]}"

    # This .wav path is only handed to save_transcription as the audio
    # file reference; no audio is written to it in this function.
    temp_path = os.path.join(os.path.dirname(base_path), f"{aggregated_name}.wav")

    output_path = transcriber.save_transcription(results, temp_path)
    print(f"✅ 聚合转录结果已保存到: {output_path}")

    return output_path
|
138 |
+
|
139 |
def verify_transcription(json_path: str):
|
140 |
"""
|
141 |
交互式验证转录结果
|
|
|
177 |
transcriber.save_transcription(results, json_path)
|
178 |
print("\n✅ 验证结果已保存")
|
179 |
|
180 |
+
def get_existing_segments(base_dir: str = "dataset/audio/segments"):
    """
    Collect the segment audio files that already exist in ``base_dir``.

    Only files named ``<anything>_segment_<N>.wav`` are considered; ``N`` is
    the 1-based segment number taken from the file name. The files are
    returned ordered by that number.

    Args:
        base_dir: Directory that holds the segment ``.wav`` files.

    Returns:
        A ``(segment_paths, original_segments)`` pair of parallel lists:
        the file paths, and one ``AudioSegment`` per file holding its audio
        with ``start_time`` 0.0 and ``end_time`` equal to the file duration.
        Both lists are empty when ``base_dir`` is not an existing directory
        or contains no matching file.
    """
    # isdir (not exists): a path that points at a regular file must not
    # reach os.listdir.
    if not os.path.isdir(base_dir):
        return [], []

    indexed_paths = []

    # Scan in sorted name order so the result does not depend on the
    # filesystem's listing order, even when two files share an index.
    for filename in sorted(os.listdir(base_dir)):
        if not filename.endswith('.wav'):
            continue

        parts = filename.split('_')
        if len(parts) < 3 or parts[-2] != "segment":
            continue

        try:
            # File names are numbered from 1; convert to a 0-based index.
            segment_idx = int(parts[-1].split('.')[0]) - 1
        except (ValueError, IndexError):
            print(f"警告: 无法从文件名 {filename} 中提取片段索引")
            continue

        indexed_paths.append((segment_idx, os.path.join(base_dir, filename)))

    indexed_paths.sort(key=lambda pair: pair[0])

    # The lists returned below are addressed by position. If the numbers
    # found on disk are not exactly 0..n-1 (gaps, duplicates from several
    # recordings), positions and file numbers disagree, so say so.
    found_indices = [idx for idx, _ in indexed_paths]
    if found_indices != list(range(len(found_indices))):
        print("警告: 片段文件编号不连续或存在重复,按位置使用的片段索引可能与文件名中的编号不一致")

    segment_paths = [path for _, path in indexed_paths]

    original_segments = []
    for path in segment_paths:
        audio_data, sample_rate = sf.read(path)
        # Timestamps are relative to the segment file itself, not to the
        # recording it was cut from.
        original_segments.append(
            AudioSegment(
                start_time=0.0,
                end_time=len(audio_data) / sample_rate,
                audio_data=audio_data,
                is_speech=True
            )
        )

    return segment_paths, original_segments
|
225 |
+
|
226 |
def main():
|
227 |
parser = argparse.ArgumentParser(description="音频处理和转录工具")
|
228 |
+
parser.add_argument("action", choices=["process", "verify", "aggregate"],
|
229 |
+
help="执行的操作: process(处理音频), verify(验证转录), aggregate(聚合转录)")
|
230 |
+
parser.add_argument("input_path", nargs='?', help="输入文件路径 (音频文件或JSON文件)")
|
231 |
+
parser.add_argument("--segments", type=str, help="要聚合的片段索引,用逗号分隔,例如 '0,1,2'")
|
232 |
|
233 |
args = parser.parse_args()
|
234 |
|
235 |
try:
|
236 |
if args.action == "process":
|
237 |
+
if not args.input_path:
|
238 |
+
print("❌ 使用 process 操作时必须指定输入文件路径")
|
239 |
+
return
|
240 |
+
|
241 |
print(f"处理音频文件: {args.input_path}")
|
242 |
# 1. 切割音频
|
243 |
segment_paths, original_segments = process_audio(args.input_path)
|
|
|
251 |
print(f"✅ 转录完成,结果保存在: {json_path}")
|
252 |
|
253 |
elif args.action == "verify":
|
254 |
+
if not args.input_path:
|
255 |
+
print("❌ 使用 verify 操作时必须指定输入文件路径")
|
256 |
+
return
|
257 |
+
|
258 |
verify_transcription(args.input_path)
|
259 |
|
260 |
+
elif args.action == "aggregate":
|
261 |
+
if not args.segments:
|
262 |
+
print("❌ 使用 aggregate 操作时必须指定 --segments 参数")
|
263 |
+
return
|
264 |
+
|
265 |
+
# 解析片段索引
|
266 |
+
try:
|
267 |
+
segment_indices = [int(idx.strip()) for idx in args.segments.split(",")]
|
268 |
+
except ValueError:
|
269 |
+
print("❌ 片段索引必须是整数,用逗号分隔")
|
270 |
+
return
|
271 |
+
|
272 |
+
# 获取已存在的音频片段
|
273 |
+
segment_paths, original_segments = get_existing_segments()
|
274 |
+
if not segment_paths:
|
275 |
+
print("❌ 未找到已处理的音频片段,请先使用 process 命令处理音频文件")
|
276 |
+
return
|
277 |
+
|
278 |
+
print(f"找到 {len(segment_paths)} 个已处理的音频片段")
|
279 |
+
|
280 |
+
# 验证索引有效性
|
281 |
+
max_idx = len(segment_paths) - 1
|
282 |
+
invalid_indices = [idx for idx in segment_indices if idx < 0 or idx > max_idx]
|
283 |
+
if invalid_indices:
|
284 |
+
print(f"❌ 无效的片段索引: {invalid_indices},有效范围: 0-{max_idx}")
|
285 |
+
return
|
286 |
+
|
287 |
+
# 聚合转录指定的片段
|
288 |
+
json_path = transcribe_aggregated_segments(segment_paths, original_segments, segment_indices)
|
289 |
+
print(f"✅ 聚合转录完成,结果保存在: {json_path}")
|
290 |
+
|
291 |
except Exception as e:
|
292 |
print(f"错误: {e}")
|
293 |
|