tianyaogavin committed on
Commit
64f8498
·
1 Parent(s): 9d802fa

add vad test framework

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. dataset/audio/converter.py +31 -0
  2. dataset/audio/segments/test1_segment_1.wav +3 -0
  3. dataset/audio/segments/test1_segment_10.wav +3 -0
  4. dataset/audio/segments/test1_segment_11.wav +3 -0
  5. dataset/audio/segments/test1_segment_12.wav +3 -0
  6. dataset/audio/segments/test1_segment_2.wav +3 -0
  7. dataset/audio/segments/test1_segment_3.wav +3 -0
  8. dataset/audio/segments/test1_segment_4.wav +3 -0
  9. dataset/audio/segments/test1_segment_5.wav +3 -0
  10. dataset/audio/segments/test1_segment_6.wav +3 -0
  11. dataset/audio/segments/test1_segment_7.wav +3 -0
  12. dataset/audio/segments/test1_segment_8.wav +3 -0
  13. dataset/audio/segments/test1_segment_9.wav +3 -0
  14. dataset/audio/temp/test1_segments_20250423_133311.json +80 -0
  15. dataset/audio/temp/test1_segments_20250423_140123.json +80 -0
  16. dataset/audio/temp/test1_segments_20250423_140325.json +80 -0
  17. dataset/audio/temp/test1_segments_20250423_140503.json +80 -0
  18. dataset/audio/temp/test1_segments_20250423_140556.json +80 -0
  19. dataset/audio/test1.m4a +3 -0
  20. dataset/audio/test1.wav +3 -0
  21. dataset/transcripts/test1.json +4 -0
  22. dataset/transcripts/test1_segment_1_20250423_133335.json +150 -0
  23. dataset/transcripts/test1_segment_1_20250423_140126.json +159 -0
  24. dataset/transcripts/test1_segment_1_20250423_140600.json +159 -0
  25. vad/README.md +152 -0
  26. vad/__init__.py +10 -0
  27. vad/__pycache__/__init__.cpython-312.pyc +0 -0
  28. vad/__pycache__/audio_processor.cpython-312.pyc +0 -0
  29. vad/__pycache__/audio_transcriber.cpython-312.pyc +0 -0
  30. vad/__pycache__/main.cpython-312.pyc +0 -0
  31. vad/audio_processor.py +212 -0
  32. vad/audio_transcriber.py +163 -0
  33. vad/dataset/audio/segments/test1_segment_1.wav +3 -0
  34. vad/dataset/audio/segments/test1_segment_10.wav +3 -0
  35. vad/dataset/audio/segments/test1_segment_11.wav +3 -0
  36. vad/dataset/audio/segments/test1_segment_12.wav +3 -0
  37. vad/dataset/audio/segments/test1_segment_13.wav +3 -0
  38. vad/dataset/audio/segments/test1_segment_14.wav +3 -0
  39. vad/dataset/audio/segments/test1_segment_15.wav +3 -0
  40. vad/dataset/audio/segments/test1_segment_16.wav +3 -0
  41. vad/dataset/audio/segments/test1_segment_17.wav +3 -0
  42. vad/dataset/audio/segments/test1_segment_18.wav +3 -0
  43. vad/dataset/audio/segments/test1_segment_2.wav +3 -0
  44. vad/dataset/audio/segments/test1_segment_3.wav +3 -0
  45. vad/dataset/audio/segments/test1_segment_4.wav +3 -0
  46. vad/dataset/audio/segments/test1_segment_5.wav +3 -0
  47. vad/dataset/audio/segments/test1_segment_6.wav +3 -0
  48. vad/dataset/audio/segments/test1_segment_7.wav +3 -0
  49. vad/dataset/audio/segments/test1_segment_8.wav +3 -0
  50. vad/dataset/audio/segments/test1_segment_9.wav +3 -0
dataset/audio/converter.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import subprocess

import imageio_ffmpeg

# One-off conversion script: re-encode an M4A recording as a 16 kHz mono WAV
# using the ffmpeg executable located by imageio_ffmpeg.get_ffmpeg_exe().
input_file = "test1.m4a"
output_file = "test1.wav"
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

cmd = [
    ffmpeg_path,
    "-y",               # overwrite the output file if it already exists
    "-i", input_file,   # input file
    "-ar", "16000",     # output sample rate
    "-ac", "1",         # mono
    output_file,
]

print(f"🎬 正在转换: {input_file} -> {output_file}")
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# FFmpeg writes all of its log output to stderr. Decode leniently so that
# stray non-UTF-8 bytes in the log cannot abort the script before the
# result is reported.
stderr = res.stderr.decode(errors="replace")
print("------ FFmpeg stderr ------")
print(stderr)
print("----------------------------")

# Report success only when FFmpeg exited cleanly AND the output exists.
# Checking existence alone is not enough: a stale output file left over from
# an earlier run would make a failed conversion look successful.
if res.returncode == 0 and os.path.exists(output_file):
    print(f"✅ 成功生成 {output_file}")
else:
    print("❌ 转换失败,请检查上面的错误信息")
dataset/audio/segments/test1_segment_1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:877aee33d778b34af2f0b819ac822d80316e97b73cb3823c1f436dbef8efcb0e
3
+ size 35564
dataset/audio/segments/test1_segment_10.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3135d983a5260d846e6cf165583efa3a0ef379bd86c885e678a63b41f66f548b
3
+ size 48044
dataset/audio/segments/test1_segment_11.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a966cbb2e82ebd278692adad509a18061306b73b715fc4a93468c27ed61627b
3
+ size 111404
dataset/audio/segments/test1_segment_12.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52cfbcdc17cc5f190df467310f1a91c89e27f79662b2ce13f4ff5ec07015afec
3
+ size 71084
dataset/audio/segments/test1_segment_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81375721eb3a532941083c9781f53f5e0f1ccbe1ef4108f98a019de400f5c564
3
+ size 117164
dataset/audio/segments/test1_segment_3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd6120ff04e7365640b9e3a1fb062bc1c31ce0dc54904bd27e25ac5a0b068cde
3
+ size 149804
dataset/audio/segments/test1_segment_4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99bc0d18ffd0d10742b8d6b5450e537eccd1497c2247e714fa8efe6beb602abd
3
+ size 41324
dataset/audio/segments/test1_segment_5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9e2196db3537028898b87442f074523251b33219302e6eb8518fb33396c30bd
3
+ size 122924
dataset/audio/segments/test1_segment_6.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e929f7966a425a559b7442a2914cb99b0df74f1d02938264642dc71f160fc383
3
+ size 113324
dataset/audio/segments/test1_segment_7.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65d800356647c415d80e59fac63db01df31ce51a497aacf43f98aa0e6ec468cb
3
+ size 77804
dataset/audio/segments/test1_segment_8.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c574a7c20332f85c6260febf6eae232473a798404ca29f1b54ac39e5b2d35c
3
+ size 91244
dataset/audio/segments/test1_segment_9.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f943b20eb3aafa0befb884f5d125e0596d3f419d8a3c5546ff3cf878603c36b8
3
+ size 67244
dataset/audio/temp/test1_segments_20250423_133311.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "../dataset/audio/test1.wav",
3
+ "timestamp": "20250423_133311",
4
+ "total_frames": 1821,
5
+ "speech_frames": 1167,
6
+ "segments": [
7
+ {
8
+ "start_time": 4.56,
9
+ "end_time": 5.67,
10
+ "duration": 1.1100000000000003,
11
+ "is_speech": true
12
+ },
13
+ {
14
+ "start_time": 8.4,
15
+ "end_time": 12.06,
16
+ "duration": 3.66,
17
+ "is_speech": true
18
+ },
19
+ {
20
+ "start_time": 13.11,
21
+ "end_time": 17.79,
22
+ "duration": 4.68,
23
+ "is_speech": true
24
+ },
25
+ {
26
+ "start_time": 19.77,
27
+ "end_time": 21.06,
28
+ "duration": 1.2899999999999991,
29
+ "is_speech": true
30
+ },
31
+ {
32
+ "start_time": 21.63,
33
+ "end_time": 25.47,
34
+ "duration": 3.84,
35
+ "is_speech": true
36
+ },
37
+ {
38
+ "start_time": 26.28,
39
+ "end_time": 29.82,
40
+ "duration": 3.539999999999999,
41
+ "is_speech": true
42
+ },
43
+ {
44
+ "start_time": 30.42,
45
+ "end_time": 32.85,
46
+ "duration": 2.4299999999999997,
47
+ "is_speech": true
48
+ },
49
+ {
50
+ "start_time": 33.54,
51
+ "end_time": 36.39,
52
+ "duration": 2.8500000000000014,
53
+ "is_speech": true
54
+ },
55
+ {
56
+ "start_time": 37.8,
57
+ "end_time": 39.9,
58
+ "duration": 2.1000000000000014,
59
+ "is_speech": true
60
+ },
61
+ {
62
+ "start_time": 40.86,
63
+ "end_time": 42.36,
64
+ "duration": 1.5,
65
+ "is_speech": true
66
+ },
67
+ {
68
+ "start_time": 43.05,
69
+ "end_time": 46.53,
70
+ "duration": 3.480000000000004,
71
+ "is_speech": true
72
+ },
73
+ {
74
+ "start_time": 47.49,
75
+ "end_time": 49.71,
76
+ "duration": 2.219999999999999,
77
+ "is_speech": true
78
+ }
79
+ ]
80
+ }
dataset/audio/temp/test1_segments_20250423_140123.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "../dataset/audio/test1.wav",
3
+ "timestamp": "20250423_140123",
4
+ "total_frames": 1821,
5
+ "speech_frames": 1167,
6
+ "segments": [
7
+ {
8
+ "start_time": 4.56,
9
+ "end_time": 5.67,
10
+ "duration": 1.1100000000000003,
11
+ "is_speech": true
12
+ },
13
+ {
14
+ "start_time": 8.4,
15
+ "end_time": 12.06,
16
+ "duration": 3.66,
17
+ "is_speech": true
18
+ },
19
+ {
20
+ "start_time": 13.11,
21
+ "end_time": 17.79,
22
+ "duration": 4.68,
23
+ "is_speech": true
24
+ },
25
+ {
26
+ "start_time": 19.77,
27
+ "end_time": 21.06,
28
+ "duration": 1.2899999999999991,
29
+ "is_speech": true
30
+ },
31
+ {
32
+ "start_time": 21.63,
33
+ "end_time": 25.47,
34
+ "duration": 3.84,
35
+ "is_speech": true
36
+ },
37
+ {
38
+ "start_time": 26.28,
39
+ "end_time": 29.82,
40
+ "duration": 3.539999999999999,
41
+ "is_speech": true
42
+ },
43
+ {
44
+ "start_time": 30.42,
45
+ "end_time": 32.85,
46
+ "duration": 2.4299999999999997,
47
+ "is_speech": true
48
+ },
49
+ {
50
+ "start_time": 33.54,
51
+ "end_time": 36.39,
52
+ "duration": 2.8500000000000014,
53
+ "is_speech": true
54
+ },
55
+ {
56
+ "start_time": 37.8,
57
+ "end_time": 39.9,
58
+ "duration": 2.1000000000000014,
59
+ "is_speech": true
60
+ },
61
+ {
62
+ "start_time": 40.86,
63
+ "end_time": 42.36,
64
+ "duration": 1.5,
65
+ "is_speech": true
66
+ },
67
+ {
68
+ "start_time": 43.05,
69
+ "end_time": 46.53,
70
+ "duration": 3.480000000000004,
71
+ "is_speech": true
72
+ },
73
+ {
74
+ "start_time": 47.49,
75
+ "end_time": 49.71,
76
+ "duration": 2.219999999999999,
77
+ "is_speech": true
78
+ }
79
+ ]
80
+ }
dataset/audio/temp/test1_segments_20250423_140325.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "../dataset/audio/test1.wav",
3
+ "timestamp": "20250423_140325",
4
+ "total_frames": 1821,
5
+ "speech_frames": 1167,
6
+ "segments": [
7
+ {
8
+ "start_time": 4.56,
9
+ "end_time": 5.67,
10
+ "duration": 1.1100000000000003,
11
+ "is_speech": true
12
+ },
13
+ {
14
+ "start_time": 8.4,
15
+ "end_time": 12.06,
16
+ "duration": 3.66,
17
+ "is_speech": true
18
+ },
19
+ {
20
+ "start_time": 13.11,
21
+ "end_time": 17.79,
22
+ "duration": 4.68,
23
+ "is_speech": true
24
+ },
25
+ {
26
+ "start_time": 19.77,
27
+ "end_time": 21.06,
28
+ "duration": 1.2899999999999991,
29
+ "is_speech": true
30
+ },
31
+ {
32
+ "start_time": 21.63,
33
+ "end_time": 25.47,
34
+ "duration": 3.84,
35
+ "is_speech": true
36
+ },
37
+ {
38
+ "start_time": 26.28,
39
+ "end_time": 29.82,
40
+ "duration": 3.539999999999999,
41
+ "is_speech": true
42
+ },
43
+ {
44
+ "start_time": 30.42,
45
+ "end_time": 32.85,
46
+ "duration": 2.4299999999999997,
47
+ "is_speech": true
48
+ },
49
+ {
50
+ "start_time": 33.54,
51
+ "end_time": 36.39,
52
+ "duration": 2.8500000000000014,
53
+ "is_speech": true
54
+ },
55
+ {
56
+ "start_time": 37.8,
57
+ "end_time": 39.9,
58
+ "duration": 2.1000000000000014,
59
+ "is_speech": true
60
+ },
61
+ {
62
+ "start_time": 40.86,
63
+ "end_time": 42.36,
64
+ "duration": 1.5,
65
+ "is_speech": true
66
+ },
67
+ {
68
+ "start_time": 43.05,
69
+ "end_time": 46.53,
70
+ "duration": 3.480000000000004,
71
+ "is_speech": true
72
+ },
73
+ {
74
+ "start_time": 47.49,
75
+ "end_time": 49.71,
76
+ "duration": 2.219999999999999,
77
+ "is_speech": true
78
+ }
79
+ ]
80
+ }
dataset/audio/temp/test1_segments_20250423_140503.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "../dataset/audio/test1.wav",
3
+ "timestamp": "20250423_140503",
4
+ "total_frames": 1821,
5
+ "speech_frames": 1167,
6
+ "segments": [
7
+ {
8
+ "start_time": 4.56,
9
+ "end_time": 5.67,
10
+ "duration": 1.1100000000000003,
11
+ "is_speech": true
12
+ },
13
+ {
14
+ "start_time": 8.4,
15
+ "end_time": 12.06,
16
+ "duration": 3.66,
17
+ "is_speech": true
18
+ },
19
+ {
20
+ "start_time": 13.11,
21
+ "end_time": 17.79,
22
+ "duration": 4.68,
23
+ "is_speech": true
24
+ },
25
+ {
26
+ "start_time": 19.77,
27
+ "end_time": 21.06,
28
+ "duration": 1.2899999999999991,
29
+ "is_speech": true
30
+ },
31
+ {
32
+ "start_time": 21.63,
33
+ "end_time": 25.47,
34
+ "duration": 3.84,
35
+ "is_speech": true
36
+ },
37
+ {
38
+ "start_time": 26.28,
39
+ "end_time": 29.82,
40
+ "duration": 3.539999999999999,
41
+ "is_speech": true
42
+ },
43
+ {
44
+ "start_time": 30.42,
45
+ "end_time": 32.85,
46
+ "duration": 2.4299999999999997,
47
+ "is_speech": true
48
+ },
49
+ {
50
+ "start_time": 33.54,
51
+ "end_time": 36.39,
52
+ "duration": 2.8500000000000014,
53
+ "is_speech": true
54
+ },
55
+ {
56
+ "start_time": 37.8,
57
+ "end_time": 39.9,
58
+ "duration": 2.1000000000000014,
59
+ "is_speech": true
60
+ },
61
+ {
62
+ "start_time": 40.86,
63
+ "end_time": 42.36,
64
+ "duration": 1.5,
65
+ "is_speech": true
66
+ },
67
+ {
68
+ "start_time": 43.05,
69
+ "end_time": 46.53,
70
+ "duration": 3.480000000000004,
71
+ "is_speech": true
72
+ },
73
+ {
74
+ "start_time": 47.49,
75
+ "end_time": 49.71,
76
+ "duration": 2.219999999999999,
77
+ "is_speech": true
78
+ }
79
+ ]
80
+ }
dataset/audio/temp/test1_segments_20250423_140556.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "../dataset/audio/test1.wav",
3
+ "timestamp": "20250423_140556",
4
+ "total_frames": 1821,
5
+ "speech_frames": 1167,
6
+ "segments": [
7
+ {
8
+ "start_time": 4.56,
9
+ "end_time": 5.67,
10
+ "duration": 1.1100000000000003,
11
+ "is_speech": true
12
+ },
13
+ {
14
+ "start_time": 8.4,
15
+ "end_time": 12.06,
16
+ "duration": 3.66,
17
+ "is_speech": true
18
+ },
19
+ {
20
+ "start_time": 13.11,
21
+ "end_time": 17.79,
22
+ "duration": 4.68,
23
+ "is_speech": true
24
+ },
25
+ {
26
+ "start_time": 19.77,
27
+ "end_time": 21.06,
28
+ "duration": 1.2899999999999991,
29
+ "is_speech": true
30
+ },
31
+ {
32
+ "start_time": 21.63,
33
+ "end_time": 25.47,
34
+ "duration": 3.84,
35
+ "is_speech": true
36
+ },
37
+ {
38
+ "start_time": 26.28,
39
+ "end_time": 29.82,
40
+ "duration": 3.539999999999999,
41
+ "is_speech": true
42
+ },
43
+ {
44
+ "start_time": 30.42,
45
+ "end_time": 32.85,
46
+ "duration": 2.4299999999999997,
47
+ "is_speech": true
48
+ },
49
+ {
50
+ "start_time": 33.54,
51
+ "end_time": 36.39,
52
+ "duration": 2.8500000000000014,
53
+ "is_speech": true
54
+ },
55
+ {
56
+ "start_time": 37.8,
57
+ "end_time": 39.9,
58
+ "duration": 2.1000000000000014,
59
+ "is_speech": true
60
+ },
61
+ {
62
+ "start_time": 40.86,
63
+ "end_time": 42.36,
64
+ "duration": 1.5,
65
+ "is_speech": true
66
+ },
67
+ {
68
+ "start_time": 43.05,
69
+ "end_time": 46.53,
70
+ "duration": 3.480000000000004,
71
+ "is_speech": true
72
+ },
73
+ {
74
+ "start_time": 47.49,
75
+ "end_time": 49.71,
76
+ "duration": 2.219999999999999,
77
+ "is_speech": true
78
+ }
79
+ ]
80
+ }
dataset/audio/test1.m4a ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6786ff56dc79183c1f66df16699fb97b3efdc9819184241772e70e89f36b875
3
+ size 1389154
dataset/audio/test1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02f25859f79b85109154b534742128605c7ea34e1154f9d17d21c302a67b92b3
3
+ size 1749070
dataset/transcripts/test1.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "felo_transcript": "第一单元,音频数据处理。\n单元简介。\n所有音频或语音相关的任务都需要使用音频文件在我们深入了解这些任务之前我们需要了解音频文件的实际内容以及如何利用音频文件本单元将为你介绍的呃本单元将为你介绍与音频数据相关的基本概念包括波形采样率和频谱图你会学习到如何使用音频数据集包括音频数据加载音频数据预处理以及高效加载大规模音频数据集的\n完成本单元的学习后,你会掌握基础的音频相关术语,并且掌握针对不同应用的音频数据处理工具。\n本单元的知识会成为后面章节的基础。",
3
+ "transcript": "第1单元:音频数据处理\n单元简介\n所有音频或语音相关的任务都需要使用音频文件。在我们深入了解这些任务之前,我们需要了解音频文件的实际内容以及如何利用音频文件。\n本单元将为你介绍与音频数据相关的基本概念,包括波形、采样率和频谱图。你会学习到如何使用音频数据集,包括音频数据加载、音频数据预处理,以及高效加载大规模音频数据集的流式加载方法。\n完成本单元的学习后,你会掌握基础的音频相关术语,并且掌握针对不同应用的音频数据处理工具。本单元的知识会成为后面章节的基础"
4
+ }
dataset/transcripts/test1_segment_1_20250423_133335.json ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "../dataset/audio/segments\\test1_segment_1.wav",
3
+ "timestamp": "20250423_133335",
4
+ "segments": [
5
+ {
6
+ "text": "音频数据处理",
7
+ "start_time": 0.0,
8
+ "end_time": 1.16,
9
+ "confidence": 0.8830433189868927,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "所有音频或语言相关的任务都需要使用音频",
16
+ "start_time": 0.0,
17
+ "end_time": 3.72,
18
+ "confidence": 0.7980242520570755,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ },
23
+ {
24
+ "text": "在我们先辱了解这些任务之前",
25
+ "start_time": 0.0,
26
+ "end_time": 1.6400000000000001,
27
+ "confidence": 0.9636461660265923,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null
31
+ },
32
+ {
33
+ "text": "我们需要了解音频文件的实际内容",
34
+ "start_time": 1.6400000000000001,
35
+ "end_time": 4.0,
36
+ "confidence": 0.9636461660265923,
37
+ "verified": false,
38
+ "verified_text": null,
39
+ "verification_notes": null
40
+ },
41
+ {
42
+ "text": "以及如何",
43
+ "start_time": 4.0,
44
+ "end_time": 4.8,
45
+ "confidence": 0.9636461660265923,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null
49
+ },
50
+ {
51
+ "text": "本来员将会你介绍的",
52
+ "start_time": 0.0,
53
+ "end_time": 1.28,
54
+ "confidence": 0.8996343165636063,
55
+ "verified": false,
56
+ "verified_text": null,
57
+ "verification_notes": null
58
+ },
59
+ {
60
+ "text": "本单元将为你介绍于音频数据相关的基本概念",
61
+ "start_time": 0.0,
62
+ "end_time": 3.92,
63
+ "confidence": 0.6721383035182953,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null
67
+ },
68
+ {
69
+ "text": "包括剝形、採揚、綠和平、布圖",
70
+ "start_time": 0.0,
71
+ "end_time": 2.0,
72
+ "confidence": 0.7332137525081635,
73
+ "verified": false,
74
+ "verified_text": null,
75
+ "verification_notes": null
76
+ },
77
+ {
78
+ "text": "你會學習到如何使用音頻",
79
+ "start_time": 2.0,
80
+ "end_time": 3.6,
81
+ "confidence": 0.7332137525081635,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null
85
+ },
86
+ {
87
+ "text": "包括音频数位加载",
88
+ "start_time": 0.0,
89
+ "end_time": 1.4000000000000001,
90
+ "confidence": 0.8692675232887268,
91
+ "verified": false,
92
+ "verified_text": null,
93
+ "verification_notes": null
94
+ },
95
+ {
96
+ "text": "音频数据处理",
97
+ "start_time": 1.4000000000000001,
98
+ "end_time": 2.4,
99
+ "confidence": 0.8692675232887268,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null
103
+ },
104
+ {
105
+ "text": "高效加载大规模音频数一级的流适加载方",
106
+ "start_time": 0.0,
107
+ "end_time": 2.88,
108
+ "confidence": 0.9492924958467484,
109
+ "verified": false,
110
+ "verified_text": null,
111
+ "verification_notes": null
112
+ },
113
+ {
114
+ "text": "完成本單元的學習後,你會找",
115
+ "start_time": 0.0,
116
+ "end_time": 2.0,
117
+ "confidence": 0.9920552605763078,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null
121
+ },
122
+ {
123
+ "text": "基础的音频相关数",
124
+ "start_time": 0.0,
125
+ "end_time": 1.6,
126
+ "confidence": 0.8243299126625061,
127
+ "verified": false,
128
+ "verified_text": null,
129
+ "verification_notes": null
130
+ },
131
+ {
132
+ "text": "并且掌握针对不同应用的音频数据处理工具",
133
+ "start_time": 0.0,
134
+ "end_time": 3.52,
135
+ "confidence": 0.9778542779386044,
136
+ "verified": false,
137
+ "verified_text": null,
138
+ "verification_notes": null
139
+ },
140
+ {
141
+ "text": "本单元的支持会成为后面章节的",
142
+ "start_time": 0.0,
143
+ "end_time": 2.0,
144
+ "confidence": 0.920660175383091,
145
+ "verified": false,
146
+ "verified_text": null,
147
+ "verification_notes": null
148
+ }
149
+ ]
150
+ }
dataset/transcripts/test1_segment_1_20250423_140126.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "../dataset/audio/segments\\test1_segment_1.wav",
3
+ "timestamp": "20250423_140126",
4
+ "segments": [
5
+ {
6
+ "text": "音频数据处理",
7
+ "start_time": 0.0,
8
+ "end_time": 1.16,
9
+ "confidence": 0.906494140625,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "所有音频或语言相关的任务都需要使用音频",
16
+ "start_time": 0.0,
17
+ "end_time": 3.72,
18
+ "confidence": 0.7564697265625,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ },
23
+ {
24
+ "text": "在我们先入了解这些任务之前",
25
+ "start_time": 0.0,
26
+ "end_time": 1.6400000000000001,
27
+ "confidence": 0.939605712890625,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null
31
+ },
32
+ {
33
+ "text": "我们需要了解音频文件的实际内容",
34
+ "start_time": 1.6400000000000001,
35
+ "end_time": 4.0,
36
+ "confidence": 0.939605712890625,
37
+ "verified": false,
38
+ "verified_text": null,
39
+ "verification_notes": null
40
+ },
41
+ {
42
+ "text": "以及如何",
43
+ "start_time": 4.0,
44
+ "end_time": 4.8,
45
+ "confidence": 0.939605712890625,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null
49
+ },
50
+ {
51
+ "text": "本台語言將為你介紹的",
52
+ "start_time": 0.0,
53
+ "end_time": 1.28,
54
+ "confidence": 0.907470703125,
55
+ "verified": false,
56
+ "verified_text": null,
57
+ "verification_notes": null
58
+ },
59
+ {
60
+ "text": "本单元将为你介绍于音频数据相关的基本概念",
61
+ "start_time": 0.0,
62
+ "end_time": 3.92,
63
+ "confidence": 0.66796875,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null
67
+ },
68
+ {
69
+ "text": "包括剝形、採用、綠和平補土",
70
+ "start_time": 0.0,
71
+ "end_time": 2.0,
72
+ "confidence": 0.708251953125,
73
+ "verified": false,
74
+ "verified_text": null,
75
+ "verification_notes": null
76
+ },
77
+ {
78
+ "text": "你會學習到如何使用音頻",
79
+ "start_time": 2.0,
80
+ "end_time": 3.6,
81
+ "confidence": 0.708251953125,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null
85
+ },
86
+ {
87
+ "text": "包括音频数位加载",
88
+ "start_time": 0.0,
89
+ "end_time": 1.4000000000000001,
90
+ "confidence": 0.86474609375,
91
+ "verified": false,
92
+ "verified_text": null,
93
+ "verification_notes": null
94
+ },
95
+ {
96
+ "text": "音频数据处理",
97
+ "start_time": 1.4000000000000001,
98
+ "end_time": 2.4,
99
+ "confidence": 0.86474609375,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null
103
+ },
104
+ {
105
+ "text": "高效加载大规模音频数级的流适加载方",
106
+ "start_time": 0.0,
107
+ "end_time": 2.88,
108
+ "confidence": 0.956787109375,
109
+ "verified": false,
110
+ "verified_text": null,
111
+ "verification_notes": null
112
+ },
113
+ {
114
+ "text": "完成本單元的學期後",
115
+ "start_time": 0.0,
116
+ "end_time": 1.44,
117
+ "confidence": 0.9926719665527344,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null
121
+ },
122
+ {
123
+ "text": "你會找",
124
+ "start_time": 1.44,
125
+ "end_time": 2.12,
126
+ "confidence": 0.9926719665527344,
127
+ "verified": false,
128
+ "verified_text": null,
129
+ "verification_notes": null
130
+ },
131
+ {
132
+ "text": "基础的音频相关数",
133
+ "start_time": 0.0,
134
+ "end_time": 1.6,
135
+ "confidence": 0.7969970703125,
136
+ "verified": false,
137
+ "verified_text": null,
138
+ "verification_notes": null
139
+ },
140
+ {
141
+ "text": "并且掌握针对不同应用的音频数据处理工具",
142
+ "start_time": 0.0,
143
+ "end_time": 3.52,
144
+ "confidence": 0.9851303100585938,
145
+ "verified": false,
146
+ "verified_text": null,
147
+ "verification_notes": null
148
+ },
149
+ {
150
+ "text": "本单元的支持会成为后面章节的",
151
+ "start_time": 0.0,
152
+ "end_time": 2.0,
153
+ "confidence": 0.930908203125,
154
+ "verified": false,
155
+ "verified_text": null,
156
+ "verification_notes": null
157
+ }
158
+ ]
159
+ }
dataset/transcripts/test1_segment_1_20250423_140600.json ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_file": "../dataset/audio/segments\\test1_segment_1.wav",
3
+ "timestamp": "20250423_140600",
4
+ "segments": [
5
+ {
6
+ "text": "音频数据处理",
7
+ "start_time": 4.56,
8
+ "end_time": 5.72,
9
+ "confidence": 0.906494140625,
10
+ "verified": false,
11
+ "verified_text": null,
12
+ "verification_notes": null
13
+ },
14
+ {
15
+ "text": "所有音频或语言相关的任务都需要使用音频",
16
+ "start_time": 8.4,
17
+ "end_time": 12.120000000000001,
18
+ "confidence": 0.7564697265625,
19
+ "verified": false,
20
+ "verified_text": null,
21
+ "verification_notes": null
22
+ },
23
+ {
24
+ "text": "在我们先入了解这些任务之前",
25
+ "start_time": 13.11,
26
+ "end_time": 14.75,
27
+ "confidence": 0.939605712890625,
28
+ "verified": false,
29
+ "verified_text": null,
30
+ "verification_notes": null
31
+ },
32
+ {
33
+ "text": "我们需要了解音频文件的实际内容",
34
+ "start_time": 14.75,
35
+ "end_time": 17.11,
36
+ "confidence": 0.939605712890625,
37
+ "verified": false,
38
+ "verified_text": null,
39
+ "verification_notes": null
40
+ },
41
+ {
42
+ "text": "以及如何",
43
+ "start_time": 17.11,
44
+ "end_time": 17.91,
45
+ "confidence": 0.939605712890625,
46
+ "verified": false,
47
+ "verified_text": null,
48
+ "verification_notes": null
49
+ },
50
+ {
51
+ "text": "本台語言將為你介紹的",
52
+ "start_time": 19.77,
53
+ "end_time": 21.05,
54
+ "confidence": 0.907470703125,
55
+ "verified": false,
56
+ "verified_text": null,
57
+ "verification_notes": null
58
+ },
59
+ {
60
+ "text": "本单元将为你介绍于音频数据相关的基本概念",
61
+ "start_time": 21.63,
62
+ "end_time": 25.549999999999997,
63
+ "confidence": 0.66796875,
64
+ "verified": false,
65
+ "verified_text": null,
66
+ "verification_notes": null
67
+ },
68
+ {
69
+ "text": "包括剝形、採用、綠和平補土",
70
+ "start_time": 26.28,
71
+ "end_time": 28.28,
72
+ "confidence": 0.708251953125,
73
+ "verified": false,
74
+ "verified_text": null,
75
+ "verification_notes": null
76
+ },
77
+ {
78
+ "text": "你會學習到如何使用音頻",
79
+ "start_time": 28.28,
80
+ "end_time": 29.880000000000003,
81
+ "confidence": 0.708251953125,
82
+ "verified": false,
83
+ "verified_text": null,
84
+ "verification_notes": null
85
+ },
86
+ {
87
+ "text": "包括音频数位加载",
88
+ "start_time": 30.42,
89
+ "end_time": 31.860000000000003,
90
+ "confidence": 0.86474609375,
91
+ "verified": false,
92
+ "verified_text": null,
93
+ "verification_notes": null
94
+ },
95
+ {
96
+ "text": "音频数据处理",
97
+ "start_time": 31.860000000000003,
98
+ "end_time": 32.86,
99
+ "confidence": 0.86474609375,
100
+ "verified": false,
101
+ "verified_text": null,
102
+ "verification_notes": null
103
+ },
104
+ {
105
+ "text": "高效加载大规模音频数级的流适加载方",
106
+ "start_time": 33.54,
107
+ "end_time": 36.42,
108
+ "confidence": 0.956787109375,
109
+ "verified": false,
110
+ "verified_text": null,
111
+ "verification_notes": null
112
+ },
113
+ {
114
+ "text": "完成本单元的学期后",
115
+ "start_time": 37.8,
116
+ "end_time": 39.199999999999996,
117
+ "confidence": 0.9926719665527344,
118
+ "verified": false,
119
+ "verified_text": null,
120
+ "verification_notes": null
121
+ },
122
+ {
123
+ "text": "你会找",
124
+ "start_time": 39.199999999999996,
125
+ "end_time": 40.0,
126
+ "confidence": 0.9926719665527344,
127
+ "verified": false,
128
+ "verified_text": null,
129
+ "verification_notes": null
130
+ },
131
+ {
132
+ "text": "基础的音频相关数",
133
+ "start_time": 40.86,
134
+ "end_time": 42.46,
135
+ "confidence": 0.7969970703125,
136
+ "verified": false,
137
+ "verified_text": null,
138
+ "verification_notes": null
139
+ },
140
+ {
141
+ "text": "并且掌握针对不同应用的音频数据处理工具",
142
+ "start_time": 43.05,
143
+ "end_time": 46.57,
144
+ "confidence": 0.9851303100585938,
145
+ "verified": false,
146
+ "verified_text": null,
147
+ "verification_notes": null
148
+ },
149
+ {
150
+ "text": "本单元的支持会成为后面章节的",
151
+ "start_time": 47.49,
152
+ "end_time": 49.49,
153
+ "confidence": 0.930908203125,
154
+ "verified": false,
155
+ "verified_text": null,
156
+ "verification_notes": null
157
+ }
158
+ ]
159
+ }
vad/README.md ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 音频数据集处理工具
2
+
3
+ 本工具用于处理音频数据集,支持音频切割、转录和人工验证功能。
4
+
5
+ ## 功能特性
6
+
7
+ ### 1. 音频处理和切割
8
+ - 基于振幅和VAD(Voice Activity Detection)的音频切割
9
+ - 自动过滤无效的短音频片段
10
+ - 保存切割后的音频片段到指定目录
11
+
12
+ ### 2. 音频转录
13
+ - 使用faster-whisper模型进行音频转录
14
+ - 支持批量处理音频片段
15
+ - 保存带时间戳的转录文本
16
+
17
+ ### 3. 人工验证界面
18
+ - 交互式验证转录结果
19
+ - 支持修改转录文本
20
+ - 添加验证注释
21
+ - 保存验证状态
22
+
23
+ ## 项目结构
24
+
25
+ ```
26
+ faster-whisper-small/
27
+ ├── vad/ # VAD音频处理工具目录
28
+ │ ├── audio_processor.py # 音频处理核心代码
29
+ │ ├── audio_transcriber.py# 音频转录核心代码
30
+ │ ├── main.py # 命令行入口
31
+ │ └── README.md # 使用说明文档
32
+ ├── dataset/ # 数据集目录
33
+ │ ├── audio/ # 存放原始音频文件
34
+ │ │ └── segments/ # 存放切割后的音频片段
35
+ │ └── transcripts/ # 存放转录和验证结果
36
+ ├── ct2_model/ # faster-whisper模型文件
37
+ └── whisper_processor/ # whisper处理器文件
38
+ ```
39
+
40
+ ## 使用方法
41
+
42
+ ### 1. 安装依赖
43
+
44
+ ```bash
45
+ pip install -r requirements.txt
46
+ ```
47
+
48
+ ### 2. 处理新的音频文件
49
+
50
+ ```bash
51
+ # 在项目根目录下运行
52
+ python vad/main.py process dataset/audio/test1.wav
53
+ ```
54
+
55
+ 这个命令会:
56
+ 1. 将音频切割成多个片段
57
+ 2. 对每个片段进行转录
58
+ 3. 提供交互式界面进行验证
59
+ 4. 保存结果到JSON文件
60
+
61
+ ### 3. 验证已有的转录结果
62
+
63
+ ```bash
64
+ # 在项目根目录下运行
65
+ python vad/main.py verify dataset/transcripts/your_transcript.json
66
+ ```
67
+
68
+ ## 参数调整
69
+
70
+ 可以通过修改 `vad/audio_processor.py` 中的参数来优化切割效果:
71
+
72
+ ```python
73
+ processor = AudioProcessor(
74
+ vad_level=2, # VAD灵敏度 (0-3)
75
+ min_silence_duration=0.5, # 最小静音持续时间(秒)
76
+ min_speech_duration=0.3, # 最小语音片段长度(秒)
77
+ amplitude_threshold=0.01 # 振幅阈值
78
+ )
79
+ ```
80
+
81
+ ## 输出文件格式
82
+
83
+ ### 1. 音频片段
84
+ 切割后的音频片段保存为WAV格式,采样率为16kHz,命名格式为:
85
+ ```
86
+ {原文件名}_segment_{序号}.wav
87
+ ```
88
+
89
+ ### 2. 转录结果
90
+ 转录结果保存为JSON格式,包含以下信息:
91
+ ```json
92
+ {
93
+ "audio_file": "dataset/audio/meeting_001.wav",
94
+ "timestamp": "20250422_182233",
95
+ "segments": [
96
+ {
97
+ "text": "今天的会议主要讨论两个议题。",
98
+ "start_time": 0.0,
99
+ "end_time": 2.5,
100
+ "confidence": 0.92,
101
+ "verified": true,
102
+ "verified_text": null,
103
+ "verification_notes": "转录正确"
104
+ },
105
+ {
106
+ "text": "第一个是项目进度报告。",
107
+ "start_time": 2.8,
108
+ "end_time": 4.6,
109
+ "confidence": 0.88,
110
+ "verified": true,
111
+ "verified_text": "第一个是项目进度汇报",
112
+ "verification_notes": "纠正:'报告'改为'汇报'"
113
+ },
114
+ {
115
+ "text": "第二个是下个月的工作计划",
116
+ "start_time": 5.0,
117
+ "end_time": 7.2,
118
+ "confidence": 0.95,
119
+ "verified": false,
120
+ "verified_text": null,
121
+ "verification_notes": null
122
+ }
123
+ ]
124
+ }
125
+ ```
126
+
127
+ 这个示例展示了:
128
+ 1. 已验证且正确的片段(第一个)
129
+ 2. 已验证且需要修正的片段(第二个)
130
+ 3. 未验证的片段(第三个)
131
+
132
+ 使用以下命令验证此转录:
133
+ ```bash
134
+ python vad/main.py verify dataset/transcripts/meeting_001_20250422_182233.json
135
+ ```
136
+
137
+ ## 注意事项
138
+
139
+ 1. 音频文件要求:
140
+ - 支持常见音频格式(WAV, MP3, M4A等)
141
+ - 建议使用16kHz采样率
142
+ - 如果是多声道音频会自动转换为单声道
143
+
144
+ 2. 性能考虑:
145
+ - 转录速度取决于CPU性能和音频长度
146
+ - 较长的音频文件会被自动切割成小片段处理
147
+
148
+ 3. 后续优化方向:
149
+ - 优化切割策略
150
+ - 添加批量处理功能
151
+ - 改进语义重组算法
152
+ - 添加GUI界面
vad/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VAD音频处理工具包
3
+
4
+ 提供音频切割、转录和验证功能。
5
+ """
6
+
7
+ from .audio_processor import AudioProcessor, AudioSegment
8
+ from .audio_transcriber import AudioTranscriber, TranscriptionResult
9
+
10
+ __all__ = ['AudioProcessor', 'AudioSegment', 'AudioTranscriber', 'TranscriptionResult']
vad/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (459 Bytes). View file
 
vad/__pycache__/audio_processor.cpython-312.pyc ADDED
Binary file (10.1 kB). View file
 
vad/__pycache__/audio_transcriber.cpython-312.pyc ADDED
Binary file (7.37 kB). View file
 
vad/__pycache__/main.cpython-312.pyc ADDED
Binary file (6.06 kB). View file
 
vad/audio_processor.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import soundfile as sf
3
+ from typing import List, Tuple, Optional, Dict
4
+ import webrtcvad
5
+ from dataclasses import dataclass, asdict
6
+ from scipy import signal
7
+ import json
8
+ import os
9
+ from datetime import datetime
10
+
11
@dataclass(eq=False)
class AudioSegment:
    """A contiguous span of samples cut out of a longer recording.

    Equality is identity-based (``eq=False``). The ``__eq__`` that
    ``@dataclass`` would otherwise generate compares the fields as a tuple,
    which evaluates ``audio_data == other.audio_data`` element-wise and then
    raises ``ValueError`` because the truth value of a multi-element array is
    ambiguous.
    """

    start_time: float  # start offset within the source audio, in seconds
    end_time: float  # end offset within the source audio, in seconds
    audio_data: np.ndarray  # samples belonging to this segment
    is_speech: bool  # whether the segment contains speech
17
+
18
class AudioProcessor:
    """Split an audio recording into speech segments.

    Every fixed-length frame is classified by an amplitude gate followed by
    WebRTC VAD. Consecutive speech frames are grouped into a segment, and a
    segment is closed once ``min_silence_duration`` worth of *consecutive*
    non-speech frames has been observed.
    """

    def __init__(self,
                 sample_rate: int = 16000,
                 frame_duration_ms: int = 30,
                 vad_level: int = 1,  # lowered VAD aggressiveness
                 min_silence_duration: float = 0.5,  # silence needed to close a segment
                 min_speech_duration: float = 1.0,  # raised so that whole sentences are kept
                 amplitude_threshold: float = 0.003):  # lowered amplitude gate
        """
        Initialise the audio processor.

        Args:
            sample_rate: Sample rate (Hz) the audio is converted to before
                frame classification.
            frame_duration_ms: Length of one VAD frame in milliseconds.
            vad_level: VAD aggressiveness (0-3), passed to ``webrtcvad.Vad``.
            min_silence_duration: Minimum run of silence (seconds) that ends
                a speech segment.
            min_speech_duration: Minimum length (seconds) a segment must have
                to be kept.
            amplitude_threshold: Peak amplitude below which a frame is treated
                as silence without consulting the VAD.

        NOTE(review): per the py-webrtcvad documentation the VAD only accepts
        10/20/30 ms frames at 8/16/32/48 kHz. Other combinations are not
        rejected here; they end up in the amplitude-only fallback of
        ``_is_speech_frame``.
        """
        self.sample_rate = sample_rate
        self.frame_duration_ms = frame_duration_ms
        self.frame_size = int(sample_rate * frame_duration_ms / 1000)
        self.vad = webrtcvad.Vad(vad_level)
        self.min_silence_frames = int(min_silence_duration * 1000 / frame_duration_ms)
        self.min_speech_frames = int(min_speech_duration * 1000 / frame_duration_ms)
        self.amplitude_threshold = amplitude_threshold

    def _is_speech_frame(self, frame: np.ndarray) -> bool:
        """
        Decide whether a single frame contains speech.

        Args:
            frame: Float samples of exactly ``self.frame_size`` length; the
                int16 conversion below assumes a nominal range of [-1, 1].

        Returns:
            True when the frame passes the amplitude gate and the VAD reports
            speech. If the VAD call fails, the decision falls back to a
            stricter amplitude test (twice the threshold).
        """
        # The VAD needs frames of an exact size; anything else is not speech.
        if len(frame) != self.frame_size:
            return False

        # Cheap amplitude gate first, so silent frames skip the conversion.
        frame_amplitude = np.max(np.abs(frame))
        if frame_amplitude < self.amplitude_threshold:
            return False

        # Convert float samples to 16-bit PCM, clipping to the valid range.
        frame_int16 = np.clip(frame * 32768, -32768, 32767).astype(np.int16)

        try:
            return self.vad.is_speech(frame_int16.tobytes(), self.sample_rate)
        except Exception as e:
            # Deliberate best effort: a VAD failure must not abort processing.
            print(f"VAD处理出错: {e}")
            return bool(frame_amplitude >= self.amplitude_threshold * 2)

    def _load_audio(self, audio_path: str) -> np.ndarray:
        """Read ``audio_path`` and return mono samples at ``self.sample_rate``."""
        print(f"正在读取音频文件: {audio_path}")
        audio_data, sample_rate = sf.read(audio_path)
        print(f"音频采样率: {sample_rate}Hz, 形状: {audio_data.shape}")

        # Down-mix before resampling: both operations are linear, so the
        # result is the same but only one channel has to be resampled.
        if audio_data.ndim > 1:
            print("检测到多声道音频,正在转换为单声道")
            audio_data = audio_data.mean(axis=1)

        if sample_rate != self.sample_rate:
            print(f"正在重采样音频从 {sample_rate}Hz 到 {self.sample_rate}Hz")
            num_samples = int(len(audio_data) * self.sample_rate / sample_rate)
            audio_data = signal.resample(audio_data, num_samples)
            print(f"重采样后音频长度: {len(audio_data)} 采样点")

        return audio_data

    def _accept_span(self, start: int, end: int, is_last: bool) -> bool:
        """Report (and log) whether the span ``[start, end)`` is long enough to keep."""
        duration_frames = (end - start) // self.frame_size
        if duration_frames >= self.min_speech_frames:
            start_time = start / self.sample_rate
            end_time = end / self.sample_rate
            if is_last:
                print(f"保存最后的语音片段: {start_time:.2f}s -> {end_time:.2f}s (持续时间: {end_time-start_time:.2f}s)")
            else:
                print(f"保存语音片段: {start_time:.2f}s -> {end_time:.2f}s (持续时间: {end_time-start_time:.2f}s)")
            return True

        if is_last:
            print(f"丢弃过短的最后语音片段: {duration_frames * self.frame_duration_ms / 1000:.2f}s")
        else:
            print(f"丢弃过短的语音片段: {duration_frames * self.frame_duration_ms / 1000:.2f}s")
        return False

    def _find_speech_spans(self, audio_data: np.ndarray) -> Tuple[List[Tuple[int, int]], int, int]:
        """
        Classify ``audio_data`` frame by frame and group speech into spans.

        Args:
            audio_data: Mono samples at ``self.sample_rate``.

        Returns:
            ``(spans, speech_frames, total_frames)`` where ``spans`` is a list
            of ``(start_sample, end_sample)`` pairs (end exclusive),
            ``speech_frames`` is the number of frames classified as speech and
            ``total_frames`` the number of frames examined.
        """
        num_samples = len(audio_data)
        # Ceiling division: the zero-padded trailing frame is classified too.
        total_frames = -(-num_samples // self.frame_size)
        speech_frames = 0
        spans: List[Tuple[int, int]] = []

        segment_start = 0
        last_speech_end = 0  # sample index just past the most recent speech frame
        silence_frame_count = 0
        is_in_speech = False

        for i in range(0, num_samples, self.frame_size):
            frame = audio_data[i:i + self.frame_size]
            if len(frame) < self.frame_size:
                # Zero-pad the final incomplete frame to the required size.
                frame = np.pad(frame, (0, self.frame_size - len(frame)), 'constant')

            if self._is_speech_frame(frame):
                speech_frames += 1
                if not is_in_speech:
                    # A new speech segment starts here.
                    segment_start = i
                    is_in_speech = True
                    print(f"\n检测到语音开始,位置: {i/self.sample_rate:.2f}秒")
                # Speech interrupts any pause: only *consecutive* silent
                # frames may add up to the closing threshold.
                silence_frame_count = 0
                last_speech_end = min(i + self.frame_size, num_samples)
                continue

            if not is_in_speech:
                continue

            silence_frame_count += 1
            # Once the silence has lasted long enough, end the current segment
            # right after its last speech frame.
            if silence_frame_count >= self.min_silence_frames:
                if self._accept_span(segment_start, last_speech_end, is_last=False):
                    spans.append((segment_start, last_speech_end))
                is_in_speech = False

        # Flush a segment that is still open when the audio ends.
        if is_in_speech and self._accept_span(segment_start, last_speech_end, is_last=True):
            spans.append((segment_start, last_speech_end))

        return spans, speech_frames, total_frames

    def _save_temp_results(self,
                           audio_path: str,
                           segments: "List[AudioSegment]",
                           total_frames: int,
                           speech_frames: int,
                           temp_dir: str) -> str:
        """Dump segment boundaries as JSON into ``temp_dir`` and return the file path."""
        os.makedirs(temp_dir, exist_ok=True)

        temp_data = {
            "audio_file": audio_path,
            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
            "total_frames": total_frames,
            "speech_frames": speech_frames,
            "segments": [
                {
                    "start_time": seg.start_time,
                    "end_time": seg.end_time,
                    "duration": seg.end_time - seg.start_time,
                    "is_speech": seg.is_speech
                }
                for seg in segments
            ]
        }

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        temp_path = os.path.join(temp_dir, f"{base_name}_segments_{temp_data['timestamp']}.json")
        with open(temp_path, 'w', encoding='utf-8') as f:
            json.dump(temp_data, f, ensure_ascii=False, indent=2)
        print(f"\n临时结果已保存到: {temp_path}")
        return temp_path

    def process_audio_file(self,
                           audio_path: str,
                           temp_dir: Optional[str] = "../dataset/audio/temp") -> "List[AudioSegment]":
        """
        Process an audio file and return the detected speech segments.

        Args:
            audio_path: Path of the audio file to read.
            temp_dir: Directory that receives a JSON dump of the segment
                boundaries. A relative path is resolved against the current
                working directory. Pass ``None`` to skip the dump.

        Returns:
            Speech segments in chronological order; each one carries its
            start/end time in seconds and the corresponding samples.
        """
        audio_data = self._load_audio(audio_path)
        print(f"开始处理音频,总长度: {len(audio_data)} 采样点 ({len(audio_data)/self.sample_rate:.2f}秒)")

        spans, speech_frames, total_frames = self._find_speech_spans(audio_data)
        segments = [
            AudioSegment(
                start_time=start / self.sample_rate,
                end_time=end / self.sample_rate,
                audio_data=audio_data[start:end],
                is_speech=True
            )
            for start, end in spans
        ]

        print(f"\n音频处理完成:")
        print(f"总帧数: {total_frames}")
        print(f"语音帧数: {speech_frames}")
        print(f"检测到的语音片段数: {len(segments)}")

        # Keep the intermediate result on disk for later inspection.
        if temp_dir is not None:
            self._save_temp_results(audio_path, segments, total_frames, speech_frames, temp_dir)

        return segments

    def save_segment(self, segment: "AudioSegment", output_path: str):
        """
        Write one segment's samples to ``output_path`` at ``self.sample_rate``.
        """
        sf.write(output_path, segment.audio_data, self.sample_rate)
199
+
200
if __name__ == "__main__":
    # Manual smoke test: segment one sample file and list the detected spans.
    processor = AudioProcessor()
    audio_path = "dataset/audio/test.wav"  # replace with a real audio file path

    try:
        segments = processor.process_audio_file(audio_path)
        print(f"检测到 {len(segments)} 个语音片段:")
        for number, segment in enumerate(segments, start=1):
            print(f"片段 {number}: {segment.start_time:.2f}s -> {segment.end_time:.2f}s")
    except Exception as e:
        # Top-level boundary of the demo: report the failure instead of a traceback.
        print(f"处理音频时出错: {e}")
vad/audio_transcriber.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from faster_whisper import WhisperModel
2
+ from audio_processor import AudioSegment
3
+ import json
4
+ from typing import List, Dict, Optional
5
+ from dataclasses import dataclass
6
+ import os
7
+ from datetime import datetime
8
+
9
@dataclass
class TranscriptionResult:
    """One transcribed span of speech together with its manual-review state."""

    text: str  # text produced by the speech-to-text model
    start_time: float  # start of the span, in seconds
    end_time: float  # end of the span, in seconds
    confidence: float  # score stored alongside the text
    verified: bool = False  # set once a reviewer has checked the span
    verified_text: Optional[str] = None  # reviewer's corrected text, if any
    verification_notes: Optional[str] = None  # free-form reviewer notes, if any
18
+
19
class AudioTranscriber:
    """Transcribe speech segments with faster-whisper and persist / review the results."""

    # Sample rate (Hz) assumed for in-memory segments when reporting durations.
    SAMPLE_RATE = 16000

    def __init__(self, model: str = "small", device: str = "cuda", compute_type: str = "int8"):
        """
        Initialise the transcriber.

        Args:
            model: Whisper model identifier, passed through to ``WhisperModel``.
            device: Device to run on ("cpu" or "cuda").
            compute_type: Computation type, passed through to ``WhisperModel``.
        """
        print("📥 Loading Whisper model...")
        self.model = WhisperModel(model, device=device, compute_type=compute_type)
        print("📥 Loading Whisper model successfully!!")

    def transcribe_segment(self,
                           segment: "AudioSegment",
                           language: str = "zh",
                           beam_size: int = 3) -> "List[TranscriptionResult]":
        """
        Transcribe a single audio segment.

        Args:
            segment: Segment whose ``audio_data`` is transcribed and whose
                ``start_time`` is used to shift the timestamps.
            language: Language code handed to the model.
            beam_size: Beam width handed to the model.

        Returns:
            One result per segment reported by the model, with timestamps
            relative to the original recording. ``confidence`` is
            ``1 - no_speech_prob``, i.e. how sure the model is that the span
            contains speech at all, not a per-word accuracy estimate.
        """
        print("Model transcribe...")
        print(f"开始转录音频片段,长度: {len(segment.audio_data)} 采样点 ({len(segment.audio_data)/self.SAMPLE_RATE:.2f}秒)")

        # Cast defensively to float32: audio loaders may hand over float64
        # arrays. NOTE(review): confirm against the faster-whisper version in
        # use whether float64 input is rejected downstream.
        waveform = segment.audio_data.astype("float32", copy=False)

        segments_generator, info = self.model.transcribe(waveform,
                                                         beam_size=beam_size,
                                                         language=language)
        print(f"检测到语言: {info.language}, 语言概率: {info.language_probability:.2f}")
        # The model returns a lazy generator; materialising it runs the decoding.
        segments = list(segments_generator)
        print(f"Model transcribe successfully! Segments count: {len(segments)}")
        if segments:
            print(segments[0])

        # Shift model timestamps so they refer to the original recording.
        return [
            TranscriptionResult(
                text=seg.text,
                start_time=segment.start_time + seg.start,
                end_time=segment.start_time + seg.end,
                confidence=1.0 - seg.no_speech_prob
            )
            for seg in segments
        ]

    def save_transcription(self,
                           results: "List[TranscriptionResult]",
                           audio_path: str,
                           output_dir: str = "../dataset/transcripts"):
        """
        Save transcription results to a JSON file.

        The file is named ``{audio base name}_{timestamp}.json`` and is
        written to ``output_dir``, which is created when missing.

        Returns:
            Path of the written JSON file.
        """
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(output_dir, f"{base_name}_{timestamp}.json")

        data = {
            "audio_file": audio_path,
            "timestamp": timestamp,
            "segments": [
                {
                    "text": r.text,
                    "start_time": r.start_time,
                    "end_time": r.end_time,
                    "confidence": r.confidence,
                    "verified": r.verified,
                    "verified_text": r.verified_text,
                    "verification_notes": r.verification_notes
                }
                for r in results
            ]
        }

        os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        return output_path

    def verify_transcription(self,
                             result: "TranscriptionResult",
                             verified_text: Optional[str] = None,
                             verification_notes: Optional[str] = None) -> "TranscriptionResult":
        """
        Record a manual review of ``result``.

        The result is modified in place and returned. ``verified_text`` and
        ``verification_notes`` are only overwritten when a value is supplied.
        """
        result.verified = True
        if verified_text is not None:
            result.verified_text = verified_text
        if verification_notes is not None:
            result.verification_notes = verification_notes
        return result

    def load_transcription(self, json_path: str) -> "List[TranscriptionResult]":
        """
        Load transcription results from a JSON file written by ``save_transcription``.

        The review fields are optional in the file: a missing ``verified``
        defaults to ``False`` and missing text/notes default to ``None``.

        Raises:
            KeyError: If a segment lacks ``text``, ``start_time``,
                ``end_time`` or ``confidence``, or the file has no
                ``segments`` entry.
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return [
            TranscriptionResult(
                text=seg["text"],
                start_time=seg["start_time"],
                end_time=seg["end_time"],
                confidence=seg["confidence"],
                verified=seg.get("verified", False),
                verified_text=seg.get("verified_text"),
                verification_notes=seg.get("verification_notes")
            )
            for seg in data["segments"]
        ]
135
+
136
if __name__ == "__main__":
    # Manual smoke test: segment a sample file, transcribe it and save the result.
    from audio_processor import AudioProcessor

    processor = AudioProcessor()
    transcriber = AudioTranscriber()

    audio_path = "../dataset/audio/test.wav"  # replace with a real audio file path
    try:
        # Step 1: cut the recording into speech segments.
        segments = processor.process_audio_file(audio_path)
        print(f"检测到 {len(segments)} 个语音片段")

        # Step 2: transcribe the segments one after another.
        all_results = []
        for number, segment in enumerate(segments, start=1):
            print(f"转录片段 {number}/{len(segments)}...")
            all_results += transcriber.transcribe_segment(segment)

        # Step 3: persist everything into a single JSON file.
        output_path = transcriber.save_transcription(all_results, audio_path)
        print(f"✅ 转录结果已保存到: {output_path}")

    except Exception as e:
        # Top-level boundary of the demo: report the failure instead of a traceback.
        print(f"处理音频时出错: {e}")
vad/dataset/audio/segments/test1_segment_1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:877aee33d778b34af2f0b819ac822d80316e97b73cb3823c1f436dbef8efcb0e
3
+ size 35564
vad/dataset/audio/segments/test1_segment_10.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3135d983a5260d846e6cf165583efa3a0ef379bd86c885e678a63b41f66f548b
3
+ size 48044
vad/dataset/audio/segments/test1_segment_11.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a966cbb2e82ebd278692adad509a18061306b73b715fc4a93468c27ed61627b
3
+ size 111404
vad/dataset/audio/segments/test1_segment_12.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52cfbcdc17cc5f190df467310f1a91c89e27f79662b2ce13f4ff5ec07015afec
3
+ size 71084
vad/dataset/audio/segments/test1_segment_13.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8eedb04a2d817c0875003a7594f8bac255a28898dfae56aa97bd3021870140b2
3
+ size 86444
vad/dataset/audio/segments/test1_segment_14.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:078850683b71e5b04781884b44bce0edb74999459b68b6fd53175ecacbd4980e
3
+ size 34604
vad/dataset/audio/segments/test1_segment_15.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9c467e2ec3bdec346774cde0480a78689c0f6f13fd093b32baaa00187c392fb
3
+ size 29804
vad/dataset/audio/segments/test1_segment_16.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59bc62b5c124c9cac5ef78c69caff4e5caf3d0333e496e382ee365142eafc354
3
+ size 47084
vad/dataset/audio/segments/test1_segment_17.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dbcb3578c8537243143da7ac2c7531ea7b9fc750cb26e9809643289eeddce7b
3
+ size 107564
vad/dataset/audio/segments/test1_segment_18.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:538b2dda6990368d68234fded22b5ed3d67c56a620e79cba7ac545a102465160
3
+ size 68204
vad/dataset/audio/segments/test1_segment_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81375721eb3a532941083c9781f53f5e0f1ccbe1ef4108f98a019de400f5c564
3
+ size 117164
vad/dataset/audio/segments/test1_segment_3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd6120ff04e7365640b9e3a1fb062bc1c31ce0dc54904bd27e25ac5a0b068cde
3
+ size 149804
vad/dataset/audio/segments/test1_segment_4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99bc0d18ffd0d10742b8d6b5450e537eccd1497c2247e714fa8efe6beb602abd
3
+ size 41324
vad/dataset/audio/segments/test1_segment_5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9e2196db3537028898b87442f074523251b33219302e6eb8518fb33396c30bd
3
+ size 122924
vad/dataset/audio/segments/test1_segment_6.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e929f7966a425a559b7442a2914cb99b0df74f1d02938264642dc71f160fc383
3
+ size 113324
vad/dataset/audio/segments/test1_segment_7.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65d800356647c415d80e59fac63db01df31ce51a497aacf43f98aa0e6ec468cb
3
+ size 77804
vad/dataset/audio/segments/test1_segment_8.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c574a7c20332f85c6260febf6eae232473a798404ca29f1b54ac39e5b2d35c
3
+ size 91244
vad/dataset/audio/segments/test1_segment_9.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f943b20eb3aafa0befb884f5d125e0596d3f419d8a3c5546ff3cf878603c36b8
3
+ size 67244