Commit
·
64f8498
1
Parent(s):
9d802fa
add vad test framework
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- dataset/audio/converter.py +31 -0
- dataset/audio/segments/test1_segment_1.wav +3 -0
- dataset/audio/segments/test1_segment_10.wav +3 -0
- dataset/audio/segments/test1_segment_11.wav +3 -0
- dataset/audio/segments/test1_segment_12.wav +3 -0
- dataset/audio/segments/test1_segment_2.wav +3 -0
- dataset/audio/segments/test1_segment_3.wav +3 -0
- dataset/audio/segments/test1_segment_4.wav +3 -0
- dataset/audio/segments/test1_segment_5.wav +3 -0
- dataset/audio/segments/test1_segment_6.wav +3 -0
- dataset/audio/segments/test1_segment_7.wav +3 -0
- dataset/audio/segments/test1_segment_8.wav +3 -0
- dataset/audio/segments/test1_segment_9.wav +3 -0
- dataset/audio/temp/test1_segments_20250423_133311.json +80 -0
- dataset/audio/temp/test1_segments_20250423_140123.json +80 -0
- dataset/audio/temp/test1_segments_20250423_140325.json +80 -0
- dataset/audio/temp/test1_segments_20250423_140503.json +80 -0
- dataset/audio/temp/test1_segments_20250423_140556.json +80 -0
- dataset/audio/test1.m4a +3 -0
- dataset/audio/test1.wav +3 -0
- dataset/transcripts/test1.json +4 -0
- dataset/transcripts/test1_segment_1_20250423_133335.json +150 -0
- dataset/transcripts/test1_segment_1_20250423_140126.json +159 -0
- dataset/transcripts/test1_segment_1_20250423_140600.json +159 -0
- vad/README.md +152 -0
- vad/__init__.py +10 -0
- vad/__pycache__/__init__.cpython-312.pyc +0 -0
- vad/__pycache__/audio_processor.cpython-312.pyc +0 -0
- vad/__pycache__/audio_transcriber.cpython-312.pyc +0 -0
- vad/__pycache__/main.cpython-312.pyc +0 -0
- vad/audio_processor.py +212 -0
- vad/audio_transcriber.py +163 -0
- vad/dataset/audio/segments/test1_segment_1.wav +3 -0
- vad/dataset/audio/segments/test1_segment_10.wav +3 -0
- vad/dataset/audio/segments/test1_segment_11.wav +3 -0
- vad/dataset/audio/segments/test1_segment_12.wav +3 -0
- vad/dataset/audio/segments/test1_segment_13.wav +3 -0
- vad/dataset/audio/segments/test1_segment_14.wav +3 -0
- vad/dataset/audio/segments/test1_segment_15.wav +3 -0
- vad/dataset/audio/segments/test1_segment_16.wav +3 -0
- vad/dataset/audio/segments/test1_segment_17.wav +3 -0
- vad/dataset/audio/segments/test1_segment_18.wav +3 -0
- vad/dataset/audio/segments/test1_segment_2.wav +3 -0
- vad/dataset/audio/segments/test1_segment_3.wav +3 -0
- vad/dataset/audio/segments/test1_segment_4.wav +3 -0
- vad/dataset/audio/segments/test1_segment_5.wav +3 -0
- vad/dataset/audio/segments/test1_segment_6.wav +3 -0
- vad/dataset/audio/segments/test1_segment_7.wav +3 -0
- vad/dataset/audio/segments/test1_segment_8.wav +3 -0
- vad/dataset/audio/segments/test1_segment_9.wav +3 -0
dataset/audio/converter.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Convert ``test1.m4a`` to a 16 kHz mono ``test1.wav`` with FFmpeg.

The FFmpeg executable is obtained from ``imageio_ffmpeg``. Input and output
paths are relative, so the script operates on the current working directory.
"""
import os
import subprocess

import imageio_ffmpeg

input_file = "test1.m4a"
output_file = "test1.wav"
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

cmd = [
    ffmpeg_path,
    "-y",  # overwrite the output file if it already exists
    "-i", input_file,  # input file
    "-ar", "16000",  # sample rate
    "-ac", "1",  # mono
    output_file,
]

print(f"🎬 正在转换: {input_file} -> {output_file}")
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# FFmpeg writes its log output to stderr. Decode leniently so that bytes
# which are not valid UTF-8 cannot abort the script before the log is shown.
stderr = res.stderr.decode(errors="replace")
print("------ FFmpeg stderr ------")
print(stderr)
print("----------------------------")

# Check whether the conversion succeeded. The exit status is checked as well
# as the file's existence: an output file left over from an earlier run would
# otherwise make a failed conversion look successful.
if res.returncode == 0 and os.path.exists(output_file):
    print(f"✅ 成功生成 {output_file}")
else:
    print("❌ 转换失败,请检查上面的错误信息")
|
dataset/audio/segments/test1_segment_1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:877aee33d778b34af2f0b819ac822d80316e97b73cb3823c1f436dbef8efcb0e
|
3 |
+
size 35564
|
dataset/audio/segments/test1_segment_10.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3135d983a5260d846e6cf165583efa3a0ef379bd86c885e678a63b41f66f548b
|
3 |
+
size 48044
|
dataset/audio/segments/test1_segment_11.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a966cbb2e82ebd278692adad509a18061306b73b715fc4a93468c27ed61627b
|
3 |
+
size 111404
|
dataset/audio/segments/test1_segment_12.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52cfbcdc17cc5f190df467310f1a91c89e27f79662b2ce13f4ff5ec07015afec
|
3 |
+
size 71084
|
dataset/audio/segments/test1_segment_2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81375721eb3a532941083c9781f53f5e0f1ccbe1ef4108f98a019de400f5c564
|
3 |
+
size 117164
|
dataset/audio/segments/test1_segment_3.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd6120ff04e7365640b9e3a1fb062bc1c31ce0dc54904bd27e25ac5a0b068cde
|
3 |
+
size 149804
|
dataset/audio/segments/test1_segment_4.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99bc0d18ffd0d10742b8d6b5450e537eccd1497c2247e714fa8efe6beb602abd
|
3 |
+
size 41324
|
dataset/audio/segments/test1_segment_5.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a9e2196db3537028898b87442f074523251b33219302e6eb8518fb33396c30bd
|
3 |
+
size 122924
|
dataset/audio/segments/test1_segment_6.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e929f7966a425a559b7442a2914cb99b0df74f1d02938264642dc71f160fc383
|
3 |
+
size 113324
|
dataset/audio/segments/test1_segment_7.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:65d800356647c415d80e59fac63db01df31ce51a497aacf43f98aa0e6ec468cb
|
3 |
+
size 77804
|
dataset/audio/segments/test1_segment_8.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c1c574a7c20332f85c6260febf6eae232473a798404ca29f1b54ac39e5b2d35c
|
3 |
+
size 91244
|
dataset/audio/segments/test1_segment_9.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f943b20eb3aafa0befb884f5d125e0596d3f419d8a3c5546ff3cf878603c36b8
|
3 |
+
size 67244
|
dataset/audio/temp/test1_segments_20250423_133311.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "../dataset/audio/test1.wav",
|
3 |
+
"timestamp": "20250423_133311",
|
4 |
+
"total_frames": 1821,
|
5 |
+
"speech_frames": 1167,
|
6 |
+
"segments": [
|
7 |
+
{
|
8 |
+
"start_time": 4.56,
|
9 |
+
"end_time": 5.67,
|
10 |
+
"duration": 1.1100000000000003,
|
11 |
+
"is_speech": true
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"start_time": 8.4,
|
15 |
+
"end_time": 12.06,
|
16 |
+
"duration": 3.66,
|
17 |
+
"is_speech": true
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"start_time": 13.11,
|
21 |
+
"end_time": 17.79,
|
22 |
+
"duration": 4.68,
|
23 |
+
"is_speech": true
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"start_time": 19.77,
|
27 |
+
"end_time": 21.06,
|
28 |
+
"duration": 1.2899999999999991,
|
29 |
+
"is_speech": true
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"start_time": 21.63,
|
33 |
+
"end_time": 25.47,
|
34 |
+
"duration": 3.84,
|
35 |
+
"is_speech": true
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"start_time": 26.28,
|
39 |
+
"end_time": 29.82,
|
40 |
+
"duration": 3.539999999999999,
|
41 |
+
"is_speech": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"start_time": 30.42,
|
45 |
+
"end_time": 32.85,
|
46 |
+
"duration": 2.4299999999999997,
|
47 |
+
"is_speech": true
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"start_time": 33.54,
|
51 |
+
"end_time": 36.39,
|
52 |
+
"duration": 2.8500000000000014,
|
53 |
+
"is_speech": true
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"start_time": 37.8,
|
57 |
+
"end_time": 39.9,
|
58 |
+
"duration": 2.1000000000000014,
|
59 |
+
"is_speech": true
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"start_time": 40.86,
|
63 |
+
"end_time": 42.36,
|
64 |
+
"duration": 1.5,
|
65 |
+
"is_speech": true
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"start_time": 43.05,
|
69 |
+
"end_time": 46.53,
|
70 |
+
"duration": 3.480000000000004,
|
71 |
+
"is_speech": true
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"start_time": 47.49,
|
75 |
+
"end_time": 49.71,
|
76 |
+
"duration": 2.219999999999999,
|
77 |
+
"is_speech": true
|
78 |
+
}
|
79 |
+
]
|
80 |
+
}
|
dataset/audio/temp/test1_segments_20250423_140123.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "../dataset/audio/test1.wav",
|
3 |
+
"timestamp": "20250423_140123",
|
4 |
+
"total_frames": 1821,
|
5 |
+
"speech_frames": 1167,
|
6 |
+
"segments": [
|
7 |
+
{
|
8 |
+
"start_time": 4.56,
|
9 |
+
"end_time": 5.67,
|
10 |
+
"duration": 1.1100000000000003,
|
11 |
+
"is_speech": true
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"start_time": 8.4,
|
15 |
+
"end_time": 12.06,
|
16 |
+
"duration": 3.66,
|
17 |
+
"is_speech": true
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"start_time": 13.11,
|
21 |
+
"end_time": 17.79,
|
22 |
+
"duration": 4.68,
|
23 |
+
"is_speech": true
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"start_time": 19.77,
|
27 |
+
"end_time": 21.06,
|
28 |
+
"duration": 1.2899999999999991,
|
29 |
+
"is_speech": true
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"start_time": 21.63,
|
33 |
+
"end_time": 25.47,
|
34 |
+
"duration": 3.84,
|
35 |
+
"is_speech": true
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"start_time": 26.28,
|
39 |
+
"end_time": 29.82,
|
40 |
+
"duration": 3.539999999999999,
|
41 |
+
"is_speech": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"start_time": 30.42,
|
45 |
+
"end_time": 32.85,
|
46 |
+
"duration": 2.4299999999999997,
|
47 |
+
"is_speech": true
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"start_time": 33.54,
|
51 |
+
"end_time": 36.39,
|
52 |
+
"duration": 2.8500000000000014,
|
53 |
+
"is_speech": true
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"start_time": 37.8,
|
57 |
+
"end_time": 39.9,
|
58 |
+
"duration": 2.1000000000000014,
|
59 |
+
"is_speech": true
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"start_time": 40.86,
|
63 |
+
"end_time": 42.36,
|
64 |
+
"duration": 1.5,
|
65 |
+
"is_speech": true
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"start_time": 43.05,
|
69 |
+
"end_time": 46.53,
|
70 |
+
"duration": 3.480000000000004,
|
71 |
+
"is_speech": true
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"start_time": 47.49,
|
75 |
+
"end_time": 49.71,
|
76 |
+
"duration": 2.219999999999999,
|
77 |
+
"is_speech": true
|
78 |
+
}
|
79 |
+
]
|
80 |
+
}
|
dataset/audio/temp/test1_segments_20250423_140325.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "../dataset/audio/test1.wav",
|
3 |
+
"timestamp": "20250423_140325",
|
4 |
+
"total_frames": 1821,
|
5 |
+
"speech_frames": 1167,
|
6 |
+
"segments": [
|
7 |
+
{
|
8 |
+
"start_time": 4.56,
|
9 |
+
"end_time": 5.67,
|
10 |
+
"duration": 1.1100000000000003,
|
11 |
+
"is_speech": true
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"start_time": 8.4,
|
15 |
+
"end_time": 12.06,
|
16 |
+
"duration": 3.66,
|
17 |
+
"is_speech": true
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"start_time": 13.11,
|
21 |
+
"end_time": 17.79,
|
22 |
+
"duration": 4.68,
|
23 |
+
"is_speech": true
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"start_time": 19.77,
|
27 |
+
"end_time": 21.06,
|
28 |
+
"duration": 1.2899999999999991,
|
29 |
+
"is_speech": true
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"start_time": 21.63,
|
33 |
+
"end_time": 25.47,
|
34 |
+
"duration": 3.84,
|
35 |
+
"is_speech": true
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"start_time": 26.28,
|
39 |
+
"end_time": 29.82,
|
40 |
+
"duration": 3.539999999999999,
|
41 |
+
"is_speech": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"start_time": 30.42,
|
45 |
+
"end_time": 32.85,
|
46 |
+
"duration": 2.4299999999999997,
|
47 |
+
"is_speech": true
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"start_time": 33.54,
|
51 |
+
"end_time": 36.39,
|
52 |
+
"duration": 2.8500000000000014,
|
53 |
+
"is_speech": true
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"start_time": 37.8,
|
57 |
+
"end_time": 39.9,
|
58 |
+
"duration": 2.1000000000000014,
|
59 |
+
"is_speech": true
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"start_time": 40.86,
|
63 |
+
"end_time": 42.36,
|
64 |
+
"duration": 1.5,
|
65 |
+
"is_speech": true
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"start_time": 43.05,
|
69 |
+
"end_time": 46.53,
|
70 |
+
"duration": 3.480000000000004,
|
71 |
+
"is_speech": true
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"start_time": 47.49,
|
75 |
+
"end_time": 49.71,
|
76 |
+
"duration": 2.219999999999999,
|
77 |
+
"is_speech": true
|
78 |
+
}
|
79 |
+
]
|
80 |
+
}
|
dataset/audio/temp/test1_segments_20250423_140503.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "../dataset/audio/test1.wav",
|
3 |
+
"timestamp": "20250423_140503",
|
4 |
+
"total_frames": 1821,
|
5 |
+
"speech_frames": 1167,
|
6 |
+
"segments": [
|
7 |
+
{
|
8 |
+
"start_time": 4.56,
|
9 |
+
"end_time": 5.67,
|
10 |
+
"duration": 1.1100000000000003,
|
11 |
+
"is_speech": true
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"start_time": 8.4,
|
15 |
+
"end_time": 12.06,
|
16 |
+
"duration": 3.66,
|
17 |
+
"is_speech": true
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"start_time": 13.11,
|
21 |
+
"end_time": 17.79,
|
22 |
+
"duration": 4.68,
|
23 |
+
"is_speech": true
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"start_time": 19.77,
|
27 |
+
"end_time": 21.06,
|
28 |
+
"duration": 1.2899999999999991,
|
29 |
+
"is_speech": true
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"start_time": 21.63,
|
33 |
+
"end_time": 25.47,
|
34 |
+
"duration": 3.84,
|
35 |
+
"is_speech": true
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"start_time": 26.28,
|
39 |
+
"end_time": 29.82,
|
40 |
+
"duration": 3.539999999999999,
|
41 |
+
"is_speech": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"start_time": 30.42,
|
45 |
+
"end_time": 32.85,
|
46 |
+
"duration": 2.4299999999999997,
|
47 |
+
"is_speech": true
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"start_time": 33.54,
|
51 |
+
"end_time": 36.39,
|
52 |
+
"duration": 2.8500000000000014,
|
53 |
+
"is_speech": true
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"start_time": 37.8,
|
57 |
+
"end_time": 39.9,
|
58 |
+
"duration": 2.1000000000000014,
|
59 |
+
"is_speech": true
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"start_time": 40.86,
|
63 |
+
"end_time": 42.36,
|
64 |
+
"duration": 1.5,
|
65 |
+
"is_speech": true
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"start_time": 43.05,
|
69 |
+
"end_time": 46.53,
|
70 |
+
"duration": 3.480000000000004,
|
71 |
+
"is_speech": true
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"start_time": 47.49,
|
75 |
+
"end_time": 49.71,
|
76 |
+
"duration": 2.219999999999999,
|
77 |
+
"is_speech": true
|
78 |
+
}
|
79 |
+
]
|
80 |
+
}
|
dataset/audio/temp/test1_segments_20250423_140556.json
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "../dataset/audio/test1.wav",
|
3 |
+
"timestamp": "20250423_140556",
|
4 |
+
"total_frames": 1821,
|
5 |
+
"speech_frames": 1167,
|
6 |
+
"segments": [
|
7 |
+
{
|
8 |
+
"start_time": 4.56,
|
9 |
+
"end_time": 5.67,
|
10 |
+
"duration": 1.1100000000000003,
|
11 |
+
"is_speech": true
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"start_time": 8.4,
|
15 |
+
"end_time": 12.06,
|
16 |
+
"duration": 3.66,
|
17 |
+
"is_speech": true
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"start_time": 13.11,
|
21 |
+
"end_time": 17.79,
|
22 |
+
"duration": 4.68,
|
23 |
+
"is_speech": true
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"start_time": 19.77,
|
27 |
+
"end_time": 21.06,
|
28 |
+
"duration": 1.2899999999999991,
|
29 |
+
"is_speech": true
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"start_time": 21.63,
|
33 |
+
"end_time": 25.47,
|
34 |
+
"duration": 3.84,
|
35 |
+
"is_speech": true
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"start_time": 26.28,
|
39 |
+
"end_time": 29.82,
|
40 |
+
"duration": 3.539999999999999,
|
41 |
+
"is_speech": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"start_time": 30.42,
|
45 |
+
"end_time": 32.85,
|
46 |
+
"duration": 2.4299999999999997,
|
47 |
+
"is_speech": true
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"start_time": 33.54,
|
51 |
+
"end_time": 36.39,
|
52 |
+
"duration": 2.8500000000000014,
|
53 |
+
"is_speech": true
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"start_time": 37.8,
|
57 |
+
"end_time": 39.9,
|
58 |
+
"duration": 2.1000000000000014,
|
59 |
+
"is_speech": true
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"start_time": 40.86,
|
63 |
+
"end_time": 42.36,
|
64 |
+
"duration": 1.5,
|
65 |
+
"is_speech": true
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"start_time": 43.05,
|
69 |
+
"end_time": 46.53,
|
70 |
+
"duration": 3.480000000000004,
|
71 |
+
"is_speech": true
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"start_time": 47.49,
|
75 |
+
"end_time": 49.71,
|
76 |
+
"duration": 2.219999999999999,
|
77 |
+
"is_speech": true
|
78 |
+
}
|
79 |
+
]
|
80 |
+
}
|
dataset/audio/test1.m4a
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d6786ff56dc79183c1f66df16699fb97b3efdc9819184241772e70e89f36b875
|
3 |
+
size 1389154
|
dataset/audio/test1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02f25859f79b85109154b534742128605c7ea34e1154f9d17d21c302a67b92b3
|
3 |
+
size 1749070
|
dataset/transcripts/test1.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"felo_transcript": "第一单元,音频数据处理。\n单元简介。\n所有音频或语音相关的任务都需要使用音频文件在我们深入了解这些任务之前我们需要了解音频文件的实际内容以及如何利用音频文件本单元将为你介绍的呃本单元将为你介绍与音频数据相关的基本概念包括波形采样率和频谱图你会学习到如何使用音频数据集包括音频数据加载音频数据预处理以及高效加载大规模音频数据集的\n完成本单元的学习后,你会掌握基础的音频相关术语,并且掌握针对不同应用的音频数据处理工具。\n本单元的知识会成为后面章节的基础。",
|
3 |
+
"transcript": "第1单元:音频数据处理\n单元简介\n所有音频或语音相关的任务都需要使用音频文件。在我们深入了解这些任务之前,我们需要了解音频文件的实际内容以及如何利用音频文件。\n本单元将为你介绍与音频数据相关的基本概念,包括波形、采样率和频谱图。你会学习到如何使用音频数据集,包括音频数据加载、音频数据预处理,以及高效加载大规模音频数据集的流式加载方法。\n完成本单元的学习后,你会掌握基础的音频相关术语,并且掌握针对不同应用的音频数据处理工具。本单元的知识会成为后面章节的基础"
|
4 |
+
}
|
dataset/transcripts/test1_segment_1_20250423_133335.json
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "../dataset/audio/segments\\test1_segment_1.wav",
|
3 |
+
"timestamp": "20250423_133335",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "音频数据处理",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 1.16,
|
9 |
+
"confidence": 0.8830433189868927,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "所有音频或语言相关的任务都需要使用音频",
|
16 |
+
"start_time": 0.0,
|
17 |
+
"end_time": 3.72,
|
18 |
+
"confidence": 0.7980242520570755,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "在我们先辱了解这些任务之前",
|
25 |
+
"start_time": 0.0,
|
26 |
+
"end_time": 1.6400000000000001,
|
27 |
+
"confidence": 0.9636461660265923,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"text": "我们需要了解音频文件的实际内容",
|
34 |
+
"start_time": 1.6400000000000001,
|
35 |
+
"end_time": 4.0,
|
36 |
+
"confidence": 0.9636461660265923,
|
37 |
+
"verified": false,
|
38 |
+
"verified_text": null,
|
39 |
+
"verification_notes": null
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "以及如何",
|
43 |
+
"start_time": 4.0,
|
44 |
+
"end_time": 4.8,
|
45 |
+
"confidence": 0.9636461660265923,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"text": "本来员将会你介绍的",
|
52 |
+
"start_time": 0.0,
|
53 |
+
"end_time": 1.28,
|
54 |
+
"confidence": 0.8996343165636063,
|
55 |
+
"verified": false,
|
56 |
+
"verified_text": null,
|
57 |
+
"verification_notes": null
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "本单元将为你介绍于音频数据相关的基本概念",
|
61 |
+
"start_time": 0.0,
|
62 |
+
"end_time": 3.92,
|
63 |
+
"confidence": 0.6721383035182953,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"text": "包括剝形、採揚、綠和平、布圖",
|
70 |
+
"start_time": 0.0,
|
71 |
+
"end_time": 2.0,
|
72 |
+
"confidence": 0.7332137525081635,
|
73 |
+
"verified": false,
|
74 |
+
"verified_text": null,
|
75 |
+
"verification_notes": null
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "你會學習到如何使用音頻",
|
79 |
+
"start_time": 2.0,
|
80 |
+
"end_time": 3.6,
|
81 |
+
"confidence": 0.7332137525081635,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"text": "包括音频数位加载",
|
88 |
+
"start_time": 0.0,
|
89 |
+
"end_time": 1.4000000000000001,
|
90 |
+
"confidence": 0.8692675232887268,
|
91 |
+
"verified": false,
|
92 |
+
"verified_text": null,
|
93 |
+
"verification_notes": null
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "音频数据处理",
|
97 |
+
"start_time": 1.4000000000000001,
|
98 |
+
"end_time": 2.4,
|
99 |
+
"confidence": 0.8692675232887268,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"text": "高效加载大规模音频数一级的流适加载方",
|
106 |
+
"start_time": 0.0,
|
107 |
+
"end_time": 2.88,
|
108 |
+
"confidence": 0.9492924958467484,
|
109 |
+
"verified": false,
|
110 |
+
"verified_text": null,
|
111 |
+
"verification_notes": null
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "完成本單元的學習後,你會找",
|
115 |
+
"start_time": 0.0,
|
116 |
+
"end_time": 2.0,
|
117 |
+
"confidence": 0.9920552605763078,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"text": "基础的音频相关数",
|
124 |
+
"start_time": 0.0,
|
125 |
+
"end_time": 1.6,
|
126 |
+
"confidence": 0.8243299126625061,
|
127 |
+
"verified": false,
|
128 |
+
"verified_text": null,
|
129 |
+
"verification_notes": null
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
133 |
+
"start_time": 0.0,
|
134 |
+
"end_time": 3.52,
|
135 |
+
"confidence": 0.9778542779386044,
|
136 |
+
"verified": false,
|
137 |
+
"verified_text": null,
|
138 |
+
"verification_notes": null
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"text": "本单元的支持会成为后面章节的",
|
142 |
+
"start_time": 0.0,
|
143 |
+
"end_time": 2.0,
|
144 |
+
"confidence": 0.920660175383091,
|
145 |
+
"verified": false,
|
146 |
+
"verified_text": null,
|
147 |
+
"verification_notes": null
|
148 |
+
}
|
149 |
+
]
|
150 |
+
}
|
dataset/transcripts/test1_segment_1_20250423_140126.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "../dataset/audio/segments\\test1_segment_1.wav",
|
3 |
+
"timestamp": "20250423_140126",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "音频数据处理",
|
7 |
+
"start_time": 0.0,
|
8 |
+
"end_time": 1.16,
|
9 |
+
"confidence": 0.906494140625,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "所有音频或语言相关的任务都需要使用音频",
|
16 |
+
"start_time": 0.0,
|
17 |
+
"end_time": 3.72,
|
18 |
+
"confidence": 0.7564697265625,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "在我们先入了解这些任务之前",
|
25 |
+
"start_time": 0.0,
|
26 |
+
"end_time": 1.6400000000000001,
|
27 |
+
"confidence": 0.939605712890625,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"text": "我们需要了解音频文件的实际内容",
|
34 |
+
"start_time": 1.6400000000000001,
|
35 |
+
"end_time": 4.0,
|
36 |
+
"confidence": 0.939605712890625,
|
37 |
+
"verified": false,
|
38 |
+
"verified_text": null,
|
39 |
+
"verification_notes": null
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "以及如何",
|
43 |
+
"start_time": 4.0,
|
44 |
+
"end_time": 4.8,
|
45 |
+
"confidence": 0.939605712890625,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"text": "本台語言將為你介紹的",
|
52 |
+
"start_time": 0.0,
|
53 |
+
"end_time": 1.28,
|
54 |
+
"confidence": 0.907470703125,
|
55 |
+
"verified": false,
|
56 |
+
"verified_text": null,
|
57 |
+
"verification_notes": null
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "本单元将为你介绍于音频数据相关的基本概念",
|
61 |
+
"start_time": 0.0,
|
62 |
+
"end_time": 3.92,
|
63 |
+
"confidence": 0.66796875,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"text": "包括剝形、採用、綠和平補土",
|
70 |
+
"start_time": 0.0,
|
71 |
+
"end_time": 2.0,
|
72 |
+
"confidence": 0.708251953125,
|
73 |
+
"verified": false,
|
74 |
+
"verified_text": null,
|
75 |
+
"verification_notes": null
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "你會學習到如何使用音頻",
|
79 |
+
"start_time": 2.0,
|
80 |
+
"end_time": 3.6,
|
81 |
+
"confidence": 0.708251953125,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"text": "包括音频数位加载",
|
88 |
+
"start_time": 0.0,
|
89 |
+
"end_time": 1.4000000000000001,
|
90 |
+
"confidence": 0.86474609375,
|
91 |
+
"verified": false,
|
92 |
+
"verified_text": null,
|
93 |
+
"verification_notes": null
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "音频数据处理",
|
97 |
+
"start_time": 1.4000000000000001,
|
98 |
+
"end_time": 2.4,
|
99 |
+
"confidence": 0.86474609375,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"text": "高效加载大规模音频数级的流适加载方",
|
106 |
+
"start_time": 0.0,
|
107 |
+
"end_time": 2.88,
|
108 |
+
"confidence": 0.956787109375,
|
109 |
+
"verified": false,
|
110 |
+
"verified_text": null,
|
111 |
+
"verification_notes": null
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "完成本單元的學期後",
|
115 |
+
"start_time": 0.0,
|
116 |
+
"end_time": 1.44,
|
117 |
+
"confidence": 0.9926719665527344,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"text": "你會找",
|
124 |
+
"start_time": 1.44,
|
125 |
+
"end_time": 2.12,
|
126 |
+
"confidence": 0.9926719665527344,
|
127 |
+
"verified": false,
|
128 |
+
"verified_text": null,
|
129 |
+
"verification_notes": null
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"text": "基础的音频相关数",
|
133 |
+
"start_time": 0.0,
|
134 |
+
"end_time": 1.6,
|
135 |
+
"confidence": 0.7969970703125,
|
136 |
+
"verified": false,
|
137 |
+
"verified_text": null,
|
138 |
+
"verification_notes": null
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
142 |
+
"start_time": 0.0,
|
143 |
+
"end_time": 3.52,
|
144 |
+
"confidence": 0.9851303100585938,
|
145 |
+
"verified": false,
|
146 |
+
"verified_text": null,
|
147 |
+
"verification_notes": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"text": "本单元的支持会成为后面章节的",
|
151 |
+
"start_time": 0.0,
|
152 |
+
"end_time": 2.0,
|
153 |
+
"confidence": 0.930908203125,
|
154 |
+
"verified": false,
|
155 |
+
"verified_text": null,
|
156 |
+
"verification_notes": null
|
157 |
+
}
|
158 |
+
]
|
159 |
+
}
|
dataset/transcripts/test1_segment_1_20250423_140600.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"audio_file": "../dataset/audio/segments\\test1_segment_1.wav",
|
3 |
+
"timestamp": "20250423_140600",
|
4 |
+
"segments": [
|
5 |
+
{
|
6 |
+
"text": "音频数据处理",
|
7 |
+
"start_time": 4.56,
|
8 |
+
"end_time": 5.72,
|
9 |
+
"confidence": 0.906494140625,
|
10 |
+
"verified": false,
|
11 |
+
"verified_text": null,
|
12 |
+
"verification_notes": null
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"text": "所有音频或语言相关的任务都需要使用音频",
|
16 |
+
"start_time": 8.4,
|
17 |
+
"end_time": 12.120000000000001,
|
18 |
+
"confidence": 0.7564697265625,
|
19 |
+
"verified": false,
|
20 |
+
"verified_text": null,
|
21 |
+
"verification_notes": null
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"text": "在我们先入了解这些任务之前",
|
25 |
+
"start_time": 13.11,
|
26 |
+
"end_time": 14.75,
|
27 |
+
"confidence": 0.939605712890625,
|
28 |
+
"verified": false,
|
29 |
+
"verified_text": null,
|
30 |
+
"verification_notes": null
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"text": "我们需要了解音频文件的实际内容",
|
34 |
+
"start_time": 14.75,
|
35 |
+
"end_time": 17.11,
|
36 |
+
"confidence": 0.939605712890625,
|
37 |
+
"verified": false,
|
38 |
+
"verified_text": null,
|
39 |
+
"verification_notes": null
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"text": "以及如何",
|
43 |
+
"start_time": 17.11,
|
44 |
+
"end_time": 17.91,
|
45 |
+
"confidence": 0.939605712890625,
|
46 |
+
"verified": false,
|
47 |
+
"verified_text": null,
|
48 |
+
"verification_notes": null
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"text": "本台語言將為你介紹的",
|
52 |
+
"start_time": 19.77,
|
53 |
+
"end_time": 21.05,
|
54 |
+
"confidence": 0.907470703125,
|
55 |
+
"verified": false,
|
56 |
+
"verified_text": null,
|
57 |
+
"verification_notes": null
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"text": "本单元将为你介绍于音频数据相关的基本概念",
|
61 |
+
"start_time": 21.63,
|
62 |
+
"end_time": 25.549999999999997,
|
63 |
+
"confidence": 0.66796875,
|
64 |
+
"verified": false,
|
65 |
+
"verified_text": null,
|
66 |
+
"verification_notes": null
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"text": "包括剝形、採用、綠和平補土",
|
70 |
+
"start_time": 26.28,
|
71 |
+
"end_time": 28.28,
|
72 |
+
"confidence": 0.708251953125,
|
73 |
+
"verified": false,
|
74 |
+
"verified_text": null,
|
75 |
+
"verification_notes": null
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"text": "你會學習到如何使用音頻",
|
79 |
+
"start_time": 28.28,
|
80 |
+
"end_time": 29.880000000000003,
|
81 |
+
"confidence": 0.708251953125,
|
82 |
+
"verified": false,
|
83 |
+
"verified_text": null,
|
84 |
+
"verification_notes": null
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"text": "包括音频数位加载",
|
88 |
+
"start_time": 30.42,
|
89 |
+
"end_time": 31.860000000000003,
|
90 |
+
"confidence": 0.86474609375,
|
91 |
+
"verified": false,
|
92 |
+
"verified_text": null,
|
93 |
+
"verification_notes": null
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"text": "音频数据处理",
|
97 |
+
"start_time": 31.860000000000003,
|
98 |
+
"end_time": 32.86,
|
99 |
+
"confidence": 0.86474609375,
|
100 |
+
"verified": false,
|
101 |
+
"verified_text": null,
|
102 |
+
"verification_notes": null
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"text": "高效加载大规模音频数级的流适加载方",
|
106 |
+
"start_time": 33.54,
|
107 |
+
"end_time": 36.42,
|
108 |
+
"confidence": 0.956787109375,
|
109 |
+
"verified": false,
|
110 |
+
"verified_text": null,
|
111 |
+
"verification_notes": null
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"text": "完成本单元的学期后",
|
115 |
+
"start_time": 37.8,
|
116 |
+
"end_time": 39.199999999999996,
|
117 |
+
"confidence": 0.9926719665527344,
|
118 |
+
"verified": false,
|
119 |
+
"verified_text": null,
|
120 |
+
"verification_notes": null
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"text": "你会找",
|
124 |
+
"start_time": 39.199999999999996,
|
125 |
+
"end_time": 40.0,
|
126 |
+
"confidence": 0.9926719665527344,
|
127 |
+
"verified": false,
|
128 |
+
"verified_text": null,
|
129 |
+
"verification_notes": null
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"text": "基础的音频相关数",
|
133 |
+
"start_time": 40.86,
|
134 |
+
"end_time": 42.46,
|
135 |
+
"confidence": 0.7969970703125,
|
136 |
+
"verified": false,
|
137 |
+
"verified_text": null,
|
138 |
+
"verification_notes": null
|
139 |
+
},
|
140 |
+
{
|
141 |
+
"text": "并且掌握针对不同应用的音频数据处理工具",
|
142 |
+
"start_time": 43.05,
|
143 |
+
"end_time": 46.57,
|
144 |
+
"confidence": 0.9851303100585938,
|
145 |
+
"verified": false,
|
146 |
+
"verified_text": null,
|
147 |
+
"verification_notes": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"text": "本单元的支持会成为后面章节的",
|
151 |
+
"start_time": 47.49,
|
152 |
+
"end_time": 49.49,
|
153 |
+
"confidence": 0.930908203125,
|
154 |
+
"verified": false,
|
155 |
+
"verified_text": null,
|
156 |
+
"verification_notes": null
|
157 |
+
}
|
158 |
+
]
|
159 |
+
}
|
vad/README.md
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 音频数据集处理工具
|
2 |
+
|
3 |
+
本工具用于处理音频数据集,支持音频切割、转录和人工验证功能。
|
4 |
+
|
5 |
+
## 功能特性
|
6 |
+
|
7 |
+
### 1. 音频处理和切割
|
8 |
+
- 基于振幅和VAD(Voice Activity Detection)的音频切割
|
9 |
+
- 自动过滤无效的短音频片段
|
10 |
+
- 保存切割后的音频片段到指定目录
|
11 |
+
|
12 |
+
### 2. 音频转录
|
13 |
+
- 使用faster-whisper模型进行音频转录
|
14 |
+
- 支持批量处理音频片段
|
15 |
+
- 保存带时间戳的转录文本
|
16 |
+
|
17 |
+
### 3. 人工验证界面
|
18 |
+
- 交互式验证转录结果
|
19 |
+
- 支持修改转录文本
|
20 |
+
- 添加验证注释
|
21 |
+
- 保存验证状态
|
22 |
+
|
23 |
+
## 项目结构
|
24 |
+
|
25 |
+
```
|
26 |
+
faster-whisper-small/
|
27 |
+
├── vad/ # VAD音频处理工具目录
|
28 |
+
│ ├── audio_processor.py # 音频处理核心代码
|
29 |
+
│ ├── audio_transcriber.py # 音频转录核心代码
|
30 |
+
│ ├── main.py # 命令行入口
|
31 |
+
│ └── README.md # 使用说明文档
|
32 |
+
├── dataset/ # 数据集目录
|
33 |
+
│ ├── audio/ # 存放原始音频文件
|
34 |
+
│ │ └── segments/ # 存放切割后的音频片段
|
35 |
+
│ └── transcripts/ # 存放转录和验证结果
|
36 |
+
├── ct2_model/ # faster-whisper模型文件
|
37 |
+
└── whisper_processor/ # whisper处理器文件
|
38 |
+
```
|
39 |
+
|
40 |
+
## 使用方法
|
41 |
+
|
42 |
+
### 1. 安装依赖
|
43 |
+
|
44 |
+
```bash
|
45 |
+
pip install -r requirements.txt
|
46 |
+
```
|
47 |
+
|
48 |
+
### 2. 处理新的音频文件
|
49 |
+
|
50 |
+
```bash
|
51 |
+
# 在项目根目录下运行
|
52 |
+
python vad/main.py process dataset/audio/test1.wav
|
53 |
+
```
|
54 |
+
|
55 |
+
这个命令会:
|
56 |
+
1. 将音频切割成多个片段
|
57 |
+
2. 对每个片段进行转录
|
58 |
+
3. 提供交互式界面进行验证
|
59 |
+
4. 保存结果到JSON文件
|
60 |
+
|
61 |
+
### 3. 验证已有的转录结果
|
62 |
+
|
63 |
+
```bash
|
64 |
+
# 在项目根目录下运行
|
65 |
+
python vad/main.py verify dataset/transcripts/your_transcript.json
|
66 |
+
```
|
67 |
+
|
68 |
+
## 参数调整
|
69 |
+
|
70 |
+
可以通过修改 `vad/audio_processor.py` 中的参数来优化切割效果:
|
71 |
+
|
72 |
+
```python
|
73 |
+
processor = AudioProcessor(
|
74 |
+
vad_level=2, # VAD灵敏度 (0-3)
|
75 |
+
min_silence_duration=0.5, # 最小静音持续时间(秒)
|
76 |
+
min_speech_duration=0.3, # 最小语音片段长度(秒)
|
77 |
+
amplitude_threshold=0.01 # 振幅阈值
|
78 |
+
)
|
79 |
+
```
|
80 |
+
|
81 |
+
## 输出文件格式
|
82 |
+
|
83 |
+
### 1. 音频片段
|
84 |
+
切割后的音频片段保存为WAV格式,采样率为16kHz,命名格式为:
|
85 |
+
```
|
86 |
+
{原文件名}_segment_{序号}.wav
|
87 |
+
```
|
88 |
+
|
89 |
+
### 2. 转录结果
|
90 |
+
转录结果保存为JSON格式,包含以下信息:
|
91 |
+
```json
|
92 |
+
{
|
93 |
+
"audio_file": "dataset/audio/meeting_001.wav",
|
94 |
+
"timestamp": "20250422_182233",
|
95 |
+
"segments": [
|
96 |
+
{
|
97 |
+
"text": "今天的会议主要讨论两个议题。",
|
98 |
+
"start_time": 0.0,
|
99 |
+
"end_time": 2.5,
|
100 |
+
"confidence": 0.92,
|
101 |
+
"verified": true,
|
102 |
+
"verified_text": null,
|
103 |
+
"verification_notes": "转录正确"
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"text": "第一个是项目进度报告。",
|
107 |
+
"start_time": 2.8,
|
108 |
+
"end_time": 4.6,
|
109 |
+
"confidence": 0.88,
|
110 |
+
"verified": true,
|
111 |
+
"verified_text": "第一个是项目进度汇报",
|
112 |
+
"verification_notes": "纠正:'报告'改为'汇报'"
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"text": "第二个是下个月的工作计划",
|
116 |
+
"start_time": 5.0,
|
117 |
+
"end_time": 7.2,
|
118 |
+
"confidence": 0.95,
|
119 |
+
"verified": false,
|
120 |
+
"verified_text": null,
|
121 |
+
"verification_notes": null
|
122 |
+
}
|
123 |
+
]
|
124 |
+
}
|
125 |
+
```
|
126 |
+
|
127 |
+
这个示例展示了:
|
128 |
+
1. 已验证且正确的片段(第一个)
|
129 |
+
2. 已验证且需要修正的片段(第二个)
|
130 |
+
3. 未验证的片段(第三个)
|
131 |
+
|
132 |
+
使用以下命令验证此转录:
|
133 |
+
```bash
|
134 |
+
python vad/main.py verify dataset/transcripts/meeting_001_20250422_182233.json
|
135 |
+
```
|
136 |
+
|
137 |
+
## 注意事项
|
138 |
+
|
139 |
+
1. 音频文件要求:
|
140 |
+
- 支持常见音频格式(WAV, MP3, M4A等)
|
141 |
+
- 建议使用16kHz采样率
|
142 |
+
- 如果是多声道音频会自动转换为单声道
|
143 |
+
|
144 |
+
2. 性能考虑:
|
145 |
+
- 转录速度取决于运行设备(CPU 或 GPU)的性能和音频长度
|
146 |
+
- 较长的音频文件会被自动切割成小片段处理
|
147 |
+
|
148 |
+
3. 后续优化方向:
|
149 |
+
- 优化切割策略
|
150 |
+
- 添加批量处理功能
|
151 |
+
- 改进语义重组算法
|
152 |
+
- 添加GUI界面
|
vad/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
VAD音频处理工具包
|
3 |
+
|
4 |
+
提供音频切割、转录和验证功能。
|
5 |
+
"""
|
6 |
+
|
7 |
+
from .audio_processor import AudioProcessor, AudioSegment
|
8 |
+
from .audio_transcriber import AudioTranscriber, TranscriptionResult
|
9 |
+
|
10 |
+
__all__ = ['AudioProcessor', 'AudioSegment', 'AudioTranscriber', 'TranscriptionResult']
|
vad/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (459 Bytes). View file
|
|
vad/__pycache__/audio_processor.cpython-312.pyc
ADDED
Binary file (10.1 kB). View file
|
|
vad/__pycache__/audio_transcriber.cpython-312.pyc
ADDED
Binary file (7.37 kB). View file
|
|
vad/__pycache__/main.cpython-312.pyc
ADDED
Binary file (6.06 kB). View file
|
|
vad/audio_processor.py
ADDED
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import soundfile as sf
|
3 |
+
from typing import List, Tuple, Optional, Dict
|
4 |
+
import webrtcvad
|
5 |
+
from dataclasses import dataclass, asdict
|
6 |
+
from scipy import signal
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
from datetime import datetime
|
10 |
+
|
11 |
+
@dataclass
class AudioSegment:
    """One contiguous slice cut out of a longer recording."""

    # Offset of the first sample of the slice from the start of the source, in seconds.
    start_time: float
    # Offset of the end of the slice from the start of the source, in seconds.
    end_time: float
    # The samples belonging to this slice.
    audio_data: np.ndarray
    # Whether the slice was classified as containing speech.
    is_speech: bool
|
17 |
+
|
18 |
+
class AudioProcessor:
    """Cut an audio file into speech segments.

    A frame is treated as speech when its peak amplitude reaches
    ``amplitude_threshold`` and the WebRTC VAD also flags it. A segment ends
    once ``min_silence_duration`` of *consecutive* non-speech frames has been
    seen; segments shorter than ``min_speech_duration`` are discarded.
    """

    def __init__(self,
                 sample_rate: int = 16000,
                 frame_duration_ms: int = 30,
                 vad_level: int = 1,
                 min_silence_duration: float = 0.5,
                 min_speech_duration: float = 1.0,
                 amplitude_threshold: float = 0.003):
        """
        Initialise the audio processor.

        Args:
            sample_rate: Sample rate (Hz) the audio is processed at; input
                files with a different rate are resampled to it.
            frame_duration_ms: Length of one VAD frame in milliseconds.
            vad_level: VAD aggressiveness (0-3), passed to ``webrtcvad.Vad``.
            min_silence_duration: Silence (seconds) that terminates a segment.
            min_speech_duration: Minimum length (seconds) of a kept segment.
            amplitude_threshold: Peak amplitude below which a frame is
                considered silent without consulting the VAD.
        """
        self.sample_rate = sample_rate
        self.frame_duration_ms = frame_duration_ms
        # Number of samples per VAD frame.
        self.frame_size = int(sample_rate * frame_duration_ms / 1000)
        self.vad = webrtcvad.Vad(vad_level)
        # Durations expressed as whole frame counts.
        self.min_silence_frames = int(min_silence_duration * 1000 / frame_duration_ms)
        self.min_speech_frames = int(min_speech_duration * 1000 / frame_duration_ms)
        self.amplitude_threshold = amplitude_threshold

    def _is_speech_frame(self, frame: np.ndarray) -> bool:
        """
        Decide whether a single frame contains speech.

        Args:
            frame: Exactly ``self.frame_size`` samples; presumably floats in
                [-1, 1] as produced by ``soundfile.read`` (values outside
                that range are clipped).

        Returns:
            True when the frame passes the amplitude gate and the VAD. If the
            VAD call raises, falls back to an amplitude-only decision using
            twice the configured threshold.
        """
        # The VAD only accepts frames of the configured length.
        if len(frame) != self.frame_size:
            return False

        # Cheap amplitude gate first: quiet frames never reach the VAD.
        frame_amplitude = np.max(np.abs(frame))
        if frame_amplitude < self.amplitude_threshold:
            return False

        # Scale to 16-bit PCM, clipping so out-of-range samples cannot wrap.
        frame_int16 = np.clip(frame * 32768, -32768, 32767).astype(np.int16)

        try:
            return bool(self.vad.is_speech(frame_int16.tobytes(), self.sample_rate))
        except Exception as e:
            print(f"VAD处理出错: {e}")
            # VAD unavailable for this frame: decide on amplitude alone.
            return bool(frame_amplitude >= self.amplitude_threshold * 2)

    def _keep_span(self, start: int, end: int, is_last: bool = False) -> bool:
        """Report whether the sample span [start, end) is long enough to keep, logging the decision."""
        duration_frames = (end - start) // self.frame_size
        if duration_frames >= self.min_speech_frames:
            start_time = start / self.sample_rate
            end_time = end / self.sample_rate
            label = "保存最后的语音片段" if is_last else "保存语音片段"
            print(f"{label}: {start_time:.2f}s -> {end_time:.2f}s (持续时间: {end_time-start_time:.2f}s)")
            return True

        label = "丢弃过短的最后语音片段" if is_last else "丢弃过短的语音片段"
        print(f"{label}: {duration_frames * self.frame_duration_ms / 1000:.2f}s")
        return False

    def _find_speech_spans(self, audio_data: np.ndarray) -> Tuple[List[Tuple[int, int]], int]:
        """
        Scan mono audio frame by frame and locate the speech segments.

        Args:
            audio_data: One-dimensional sample array at ``self.sample_rate``.

        Returns:
            ``(spans, speech_frames)`` where ``spans`` is a list of
            ``(start_sample, end_sample)`` pairs (end exclusive) and
            ``speech_frames`` is the number of frames classified as speech.
        """
        spans: List[Tuple[int, int]] = []
        speech_frames = 0
        segment_start = 0
        silence_frame_count = 0
        is_in_speech = False

        for i in range(0, len(audio_data), self.frame_size):
            frame = audio_data[i:i + self.frame_size]
            if len(frame) < self.frame_size:
                # Zero-pad the trailing partial frame to full length.
                frame = np.pad(frame, (0, self.frame_size - len(frame)), 'constant')

            if self._is_speech_frame(frame):
                speech_frames += 1
                if not is_in_speech:
                    segment_start = i
                    is_in_speech = True
                    print(f"\n检测到语音开始,位置: {i/self.sample_rate:.2f}秒")
                # Speech (re)started: only consecutive silence may end a segment.
                silence_frame_count = 0
                continue

            if not is_in_speech:
                continue

            silence_frame_count += 1
            if silence_frame_count < self.min_silence_frames:
                continue

            # ``i`` is the start of the current (last counted) silent frame, so
            # the silent run began (silence_frame_count - 1) frames earlier;
            # that position is the exclusive end of the speech.
            segment_end = i - (silence_frame_count - 1) * self.frame_size
            if self._keep_span(segment_start, segment_end):
                spans.append((segment_start, segment_end))
            is_in_speech = False

        # Audio ended while still inside a segment: close it at the last sample.
        if is_in_speech:
            segment_end = len(audio_data)
            if self._keep_span(segment_start, segment_end, is_last=True):
                spans.append((segment_start, segment_end))

        return spans, speech_frames

    def _save_segment_summary(self,
                              audio_path: str,
                              segments: List[AudioSegment],
                              total_frames: int,
                              speech_frames: int,
                              temp_dir: str) -> str:
        """Write a JSON summary of the detected segments into ``temp_dir`` and return its path."""
        os.makedirs(temp_dir, exist_ok=True)

        temp_data = {
            "audio_file": audio_path,
            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
            "total_frames": total_frames,
            "speech_frames": speech_frames,
            "segments": [
                {
                    "start_time": seg.start_time,
                    "end_time": seg.end_time,
                    "duration": seg.end_time - seg.start_time,
                    "is_speech": seg.is_speech
                }
                for seg in segments
            ]
        }

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        temp_path = os.path.join(temp_dir, f"{base_name}_segments_{temp_data['timestamp']}.json")
        with open(temp_path, 'w', encoding='utf-8') as f:
            json.dump(temp_data, f, ensure_ascii=False, indent=2)
        print(f"\n临时结果已保存到: {temp_path}")
        return temp_path

    def process_audio_file(self,
                           audio_path: str,
                           temp_dir: Optional[str] = "../dataset/audio/temp") -> List[AudioSegment]:
        """
        Read an audio file and return the speech segments found in it.

        Args:
            audio_path: Path of the audio file to read with ``soundfile``.
            temp_dir: Directory that receives a JSON summary of the detected
                segments. The default is relative to the current working
                directory; pass ``None`` to skip writing the summary.

        Returns:
            The kept speech segments in chronological order.
        """
        print(f"正在读取音频文件: {audio_path}")
        audio_data, sample_rate = sf.read(audio_path)
        print(f"音频采样率: {sample_rate}Hz, 形状: {audio_data.shape}")

        if sample_rate != self.sample_rate:
            print(f"正在重采样音频从 {sample_rate}Hz 到 {self.sample_rate}Hz")
            # scipy resamples along axis 0, i.e. along time for (frames, channels) data.
            num_samples = int(len(audio_data) * self.sample_rate / sample_rate)
            audio_data = signal.resample(audio_data, num_samples)
            print(f"重采样后音频长度: {len(audio_data)} 采样点")

        if len(audio_data.shape) > 1:
            print("检测到多声道音频,正在转换为单声道")
            audio_data = audio_data.mean(axis=1)  # average the channels

        print(f"开始处理音频,总长度: {len(audio_data)} 采样点 ({len(audio_data)/self.sample_rate:.2f}秒)")

        # Counts complete frames only; a trailing partial frame is still scanned.
        total_frames = len(audio_data) // self.frame_size
        spans, speech_frames = self._find_speech_spans(audio_data)

        segments: List[AudioSegment] = [
            AudioSegment(
                start_time=start / self.sample_rate,
                end_time=end / self.sample_rate,
                audio_data=audio_data[start:end],
                is_speech=True
            )
            for start, end in spans
        ]

        print(f"\n音频处理完成:")
        print(f"总帧数: {total_frames}")
        print(f"语音帧数: {speech_frames}")
        print(f"检测到的语音片段数: {len(segments)}")

        if temp_dir is not None:
            self._save_segment_summary(audio_path, segments, total_frames, speech_frames, temp_dir)

        return segments

    def save_segment(self, segment: AudioSegment, output_path: str):
        """
        Write one segment's samples to ``output_path`` at ``self.sample_rate``.
        """
        sf.write(output_path, segment.audio_data, self.sample_rate)
|
199 |
+
|
200 |
+
if __name__ == "__main__":
    # Manual smoke test: segment one file and list the detected spans.
    processor = AudioProcessor()

    # Replace with the path of a real audio file.
    audio_path = "dataset/audio/test.wav"
    try:
        segments = processor.process_audio_file(audio_path)
        print(f"检测到 {len(segments)} 个语音片段:")
        for number, seg in enumerate(segments, start=1):
            print(f"片段 {number}: {seg.start_time:.2f}s -> {seg.end_time:.2f}s")
    except Exception as exc:
        print(f"处理音频时出错: {exc}")
|
vad/audio_transcriber.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from faster_whisper import WhisperModel
|
2 |
+
from audio_processor import AudioSegment
|
3 |
+
import json
|
4 |
+
from typing import List, Dict, Optional
|
5 |
+
from dataclasses import dataclass
|
6 |
+
import os
|
7 |
+
from datetime import datetime
|
8 |
+
|
9 |
+
@dataclass
class TranscriptionResult:
    """A single transcribed span together with its manual-verification state."""

    # Text produced by the model for this span.
    text: str
    # Start and end of the span, in seconds.
    start_time: float
    end_time: float
    # Score attached to the transcription by whoever created the result.
    confidence: float
    # Set to True once a person has reviewed the span.
    verified: bool = False
    # Corrected text supplied during review, if any.
    verified_text: Optional[str] = None
    # Free-form reviewer notes, if any.
    verification_notes: Optional[str] = None
|
18 |
+
|
19 |
+
class AudioTranscriber:
    """Transcribe audio segments with faster-whisper and persist/verify the results."""

    def __init__(self, model: str = "small", device: str = "cuda", compute_type: str = "int8"):
        """
        Initialise the transcriber.

        Args:
            model: Whisper model size or path, passed to ``WhisperModel``.
            device: Device to run on ("cpu" or "cuda").
            compute_type: Computation type passed to ``WhisperModel``.
        """
        print("📥 Loading Whisper model...")
        self.model = WhisperModel(model, device=device, compute_type=compute_type)
        print("📥 Loading Whisper model successfully!!")

    def transcribe_segment(self,
                           segment: AudioSegment,
                           language: str = "zh",
                           beam_size: int = 3) -> List[TranscriptionResult]:
        """
        Transcribe a single audio segment.

        Args:
            segment: Segment whose ``audio_data`` is transcribed and whose
                ``start_time`` is used to offset the returned timestamps.
            language: Language code passed to the model.
            beam_size: Beam size passed to the model.

        Returns:
            One result per model segment, with timestamps relative to the
            original recording. ``confidence`` is ``1 - no_speech_prob``,
            i.e. the model's estimate that the span contains speech, not a
            per-word accuracy score.
        """
        print("Model transcribe...")
        # NOTE(review): the logged duration assumes 16 kHz audio; the segment
        # itself does not carry its sample rate.
        print(f"开始转录音频片段,长度: {len(segment.audio_data)} 采样点 ({len(segment.audio_data)/16000:.2f}秒)")
        # soundfile/scipy yield float64 by default; hand the model a float32
        # waveform, which is what faster-whisper's own audio decoder produces.
        waveform = segment.audio_data.astype("float32", copy=False)
        segments_generator, info = self.model.transcribe(waveform,
                                                         beam_size=beam_size,
                                                         language=language)
        print(f"检测到语言: {info.language}, 语言概率: {info.language_probability:.2f}")
        # The generator is lazy; materialising it runs the actual decoding.
        segments = list(segments_generator)
        print(f"Model transcribe successfully! Segments count: {len(segments)}")
        if len(segments) > 0:
            print(segments[0])

        results = []
        for seg in segments:
            # Shift segment-relative timestamps to positions in the source audio.
            results.append(TranscriptionResult(
                text=seg.text,
                start_time=segment.start_time + seg.start,
                end_time=segment.start_time + seg.end,
                confidence=1.0 - seg.no_speech_prob
            ))

        return results

    def save_transcription(self,
                           results: List[TranscriptionResult],
                           audio_path: str,
                           output_dir: str = "../dataset/transcripts"):
        """
        Save transcription results to a timestamped JSON file.

        Args:
            results: Results to serialise.
            audio_path: Source audio path; its base name prefixes the file name.
            output_dir: Target directory, created if missing. The default is
                relative to the current working directory.

        Returns:
            Path of the written JSON file.
        """
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(output_dir, f"{base_name}_{timestamp}.json")

        data = {
            "audio_file": audio_path,
            "timestamp": timestamp,
            "segments": [
                {
                    "text": r.text,
                    "start_time": r.start_time,
                    "end_time": r.end_time,
                    "confidence": r.confidence,
                    "verified": r.verified,
                    "verified_text": r.verified_text,
                    "verification_notes": r.verification_notes
                }
                for r in results
            ]
        }

        os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        return output_path

    def verify_transcription(self,
                             result: TranscriptionResult,
                             verified_text: Optional[str] = None,
                             verification_notes: Optional[str] = None) -> TranscriptionResult:
        """
        Mark a result as manually verified, in place.

        Args:
            result: Result to update.
            verified_text: Corrected text; left unchanged when ``None``.
            verification_notes: Reviewer notes; left unchanged when ``None``.

        Returns:
            The same ``result`` object, now flagged as verified.
        """
        result.verified = True
        if verified_text is not None:
            result.verified_text = verified_text
        if verification_notes is not None:
            result.verification_notes = verification_notes
        return result

    def load_transcription(self, json_path: str) -> List[TranscriptionResult]:
        """
        Load transcription results from a JSON file.

        Args:
            json_path: File in the layout written by ``save_transcription``.

        Returns:
            One result per entry of the file's ``"segments"`` list. Missing
            verification fields fall back to "not verified".
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return [
            TranscriptionResult(
                text=seg["text"],
                start_time=seg["start_time"],
                end_time=seg["end_time"],
                confidence=seg["confidence"],
                # Treat entries without the flag as unverified, consistent
                # with the other optional verification fields.
                verified=seg.get("verified", False),
                verified_text=seg.get("verified_text"),
                verification_notes=seg.get("verification_notes")
            )
            for seg in data["segments"]
        ]
|
135 |
+
|
136 |
+
if __name__ == "__main__":
    # Manual end-to-end demo: segment a file, transcribe it, save the result.
    from audio_processor import AudioProcessor

    processor = AudioProcessor()
    transcriber = AudioTranscriber()

    # Replace with the path of a real audio file.
    audio_path = "../dataset/audio/test.wav"
    try:
        # Step 1: cut the recording into speech segments.
        segments = processor.process_audio_file(audio_path)
        print(f"检测到 {len(segments)} 个语音片段")

        # Step 2: transcribe the segments one by one, reporting progress.
        all_results = []
        segment_count = len(segments)
        for index, piece in enumerate(segments, start=1):
            print(f"转录片段 {index}/{segment_count}...")
            all_results.extend(transcriber.transcribe_segment(piece))

        # Step 3: persist everything in a single JSON file.
        output_path = transcriber.save_transcription(all_results, audio_path)
        print(f"✅ 转录结果已保存到: {output_path}")

    except Exception as exc:
        print(f"处理音频时出错: {exc}")
|
vad/dataset/audio/segments/test1_segment_1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:877aee33d778b34af2f0b819ac822d80316e97b73cb3823c1f436dbef8efcb0e
|
3 |
+
size 35564
|
vad/dataset/audio/segments/test1_segment_10.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3135d983a5260d846e6cf165583efa3a0ef379bd86c885e678a63b41f66f548b
|
3 |
+
size 48044
|
vad/dataset/audio/segments/test1_segment_11.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a966cbb2e82ebd278692adad509a18061306b73b715fc4a93468c27ed61627b
|
3 |
+
size 111404
|
vad/dataset/audio/segments/test1_segment_12.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52cfbcdc17cc5f190df467310f1a91c89e27f79662b2ce13f4ff5ec07015afec
|
3 |
+
size 71084
|
vad/dataset/audio/segments/test1_segment_13.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8eedb04a2d817c0875003a7594f8bac255a28898dfae56aa97bd3021870140b2
|
3 |
+
size 86444
|
vad/dataset/audio/segments/test1_segment_14.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:078850683b71e5b04781884b44bce0edb74999459b68b6fd53175ecacbd4980e
|
3 |
+
size 34604
|
vad/dataset/audio/segments/test1_segment_15.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c9c467e2ec3bdec346774cde0480a78689c0f6f13fd093b32baaa00187c392fb
|
3 |
+
size 29804
|
vad/dataset/audio/segments/test1_segment_16.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59bc62b5c124c9cac5ef78c69caff4e5caf3d0333e496e382ee365142eafc354
|
3 |
+
size 47084
|
vad/dataset/audio/segments/test1_segment_17.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2dbcb3578c8537243143da7ac2c7531ea7b9fc750cb26e9809643289eeddce7b
|
3 |
+
size 107564
|
vad/dataset/audio/segments/test1_segment_18.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:538b2dda6990368d68234fded22b5ed3d67c56a620e79cba7ac545a102465160
|
3 |
+
size 68204
|
vad/dataset/audio/segments/test1_segment_2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81375721eb3a532941083c9781f53f5e0f1ccbe1ef4108f98a019de400f5c564
|
3 |
+
size 117164
|
vad/dataset/audio/segments/test1_segment_3.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd6120ff04e7365640b9e3a1fb062bc1c31ce0dc54904bd27e25ac5a0b068cde
|
3 |
+
size 149804
|
vad/dataset/audio/segments/test1_segment_4.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99bc0d18ffd0d10742b8d6b5450e537eccd1497c2247e714fa8efe6beb602abd
|
3 |
+
size 41324
|
vad/dataset/audio/segments/test1_segment_5.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a9e2196db3537028898b87442f074523251b33219302e6eb8518fb33396c30bd
|
3 |
+
size 122924
|
vad/dataset/audio/segments/test1_segment_6.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e929f7966a425a559b7442a2914cb99b0df74f1d02938264642dc71f160fc383
|
3 |
+
size 113324
|
vad/dataset/audio/segments/test1_segment_7.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:65d800356647c415d80e59fac63db01df31ce51a497aacf43f98aa0e6ec468cb
|
3 |
+
size 77804
|
vad/dataset/audio/segments/test1_segment_8.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c1c574a7c20332f85c6260febf6eae232473a798404ca29f1b54ac39e5b2d35c
|
3 |
+
size 91244
|
vad/dataset/audio/segments/test1_segment_9.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f943b20eb3aafa0befb884f5d125e0596d3f419d8a3c5546ff3cf878603c36b8
|
3 |
+
size 67244
|