积极的屁孩 committed on
Commit
29b1e08
·
1 Parent(s): e48a9d8

adjust all frequency

Browse files
Files changed (1) hide show
  1. app.py +199 -79
app.py CHANGED
@@ -316,167 +316,287 @@ def vevo_timbre(content_wav, reference_wav):
316
  temp_reference_path = "wav/temp_reference.wav"
317
  output_path = "wav/output_vevotimbre.wav"
318
 
319
- # 检查并正确处理音频数据
320
  if content_wav is None or reference_wav is None:
321
  raise ValueError("请上传音频文件")
322
 
323
- # Gradio音频组件返回(sample_rate, data)元组或(data, sample_rate)元组
324
  if isinstance(content_wav, tuple) and len(content_wav) == 2:
325
- # 确保正确的顺序 (data, sample_rate)
326
  if isinstance(content_wav[0], np.ndarray):
327
  content_data, content_sr = content_wav
328
  else:
329
  content_sr, content_data = content_wav
330
- content_tensor = torch.FloatTensor(content_data)
331
- if content_tensor.ndim == 1:
332
- content_tensor = content_tensor.unsqueeze(0) # 添加通道维度
 
 
 
 
 
 
 
 
 
 
 
 
333
  else:
334
  raise ValueError("内容音频格式不正确")
335
-
 
336
  if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
337
- # 确保正确的顺序 (data, sample_rate)
338
  if isinstance(reference_wav[0], np.ndarray):
339
  reference_data, reference_sr = reference_wav
340
  else:
341
  reference_sr, reference_data = reference_wav
342
- reference_tensor = torch.FloatTensor(reference_data)
343
- if reference_tensor.ndim == 1:
344
- reference_tensor = reference_tensor.unsqueeze(0) # 添加通道维度
 
 
 
 
 
 
 
 
 
 
 
 
345
  else:
346
  raise ValueError("参考音频格式不正确")
347
 
 
 
 
 
348
  # 保存上传的音频
349
  torchaudio.save(temp_content_path, content_tensor, content_sr)
350
  torchaudio.save(temp_reference_path, reference_tensor, reference_sr)
351
 
352
- # 获取管道
353
- pipeline = get_pipeline("timbre")
354
-
355
- # 推理
356
- gen_audio = pipeline.inference_fm(
357
- src_wav_path=temp_content_path,
358
- timbre_ref_wav_path=temp_reference_path,
359
- flow_matching_steps=32,
360
- )
361
-
362
- # 保存生成的音频
363
- save_audio(gen_audio, output_path=output_path)
364
-
365
- return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
  def vevo_voice(content_wav, reference_wav):
368
  temp_content_path = "wav/temp_content.wav"
369
  temp_reference_path = "wav/temp_reference.wav"
370
  output_path = "wav/output_vevovoice.wav"
371
 
372
- # 检查并正确处理音频数据
373
  if content_wav is None or reference_wav is None:
374
  raise ValueError("请上传音频文件")
375
 
376
- # Gradio音频组件返回(sample_rate, data)元组或(data, sample_rate)元组
377
  if isinstance(content_wav, tuple) and len(content_wav) == 2:
378
- # 确保正确的顺序 (data, sample_rate)
379
  if isinstance(content_wav[0], np.ndarray):
380
  content_data, content_sr = content_wav
381
  else:
382
  content_sr, content_data = content_wav
383
- content_tensor = torch.FloatTensor(content_data)
384
- if content_tensor.ndim == 1:
385
- content_tensor = content_tensor.unsqueeze(0) # 添加通道维度
 
 
 
 
 
 
 
 
 
 
 
 
386
  else:
387
  raise ValueError("内容音频格式不正确")
388
-
 
389
  if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
390
- # 确保正确的顺序 (data, sample_rate)
391
  if isinstance(reference_wav[0], np.ndarray):
392
  reference_data, reference_sr = reference_wav
393
  else:
394
  reference_sr, reference_data = reference_wav
395
- reference_tensor = torch.FloatTensor(reference_data)
396
- if reference_tensor.ndim == 1:
397
- reference_tensor = reference_tensor.unsqueeze(0) # 添加通道维度
 
 
 
 
 
 
 
 
 
 
 
 
398
  else:
399
  raise ValueError("参考音频格式不正确")
400
 
 
 
 
 
401
  # 保存上传的音频
402
  torchaudio.save(temp_content_path, content_tensor, content_sr)
403
  torchaudio.save(temp_reference_path, reference_tensor, reference_sr)
404
 
405
- # 获取管道
406
- pipeline = get_pipeline("voice")
407
-
408
- # 推理
409
- gen_audio = pipeline.inference_ar_and_fm(
410
- src_wav_path=temp_content_path,
411
- src_text=None,
412
- style_ref_wav_path=temp_reference_path,
413
- timbre_ref_wav_path=temp_reference_path,
414
- )
415
-
416
- # 保存生成的音频
417
- save_audio(gen_audio, output_path=output_path)
418
-
419
- return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
420
 
421
  def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
422
  temp_ref_path = "wav/temp_ref.wav"
423
  temp_timbre_path = "wav/temp_timbre.wav"
424
  output_path = "wav/output_vevotts.wav"
425
 
426
- # 检查并正确处理音频数据
427
  if ref_wav is None:
428
  raise ValueError("请上传参考音频文件")
429
 
430
- # Gradio音频组件返回(sample_rate, data)元组或(data, sample_rate)元组
431
  if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
432
- # 确保正确的顺序 (data, sample_rate)
433
  if isinstance(ref_wav[0], np.ndarray):
434
  ref_data, ref_sr = ref_wav
435
  else:
436
  ref_sr, ref_data = ref_wav
437
- ref_tensor = torch.FloatTensor(ref_data)
438
- if ref_tensor.ndim == 1:
439
- ref_tensor = ref_tensor.unsqueeze(0) # 添加通道维度
 
 
 
 
 
 
 
 
 
 
 
 
440
  else:
441
  raise ValueError("参考音频格式不正确")
442
 
 
 
 
443
  # 保存上传的音频
444
  torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
445
 
446
  if timbre_ref_wav is not None:
447
  if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
448
- # 确保正确的顺序 (data, sample_rate)
449
  if isinstance(timbre_ref_wav[0], np.ndarray):
450
  timbre_data, timbre_sr = timbre_ref_wav
451
  else:
452
  timbre_sr, timbre_data = timbre_ref_wav
453
- timbre_tensor = torch.FloatTensor(timbre_data)
454
- if timbre_tensor.ndim == 1:
455
- timbre_tensor = timbre_tensor.unsqueeze(0) # 添加通道维度
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
457
  else:
458
  raise ValueError("音色参考音频格式不正确")
459
  else:
460
  temp_timbre_path = temp_ref_path
461
 
462
- # 获取管道
463
- pipeline = get_pipeline("tts")
464
-
465
- # 推理
466
- gen_audio = pipeline.inference_ar_and_fm(
467
- src_wav_path=None,
468
- src_text=text,
469
- style_ref_wav_path=temp_ref_path,
470
- timbre_ref_wav_path=temp_timbre_path,
471
- style_ref_wav_text=None,
472
- src_text_language=src_language,
473
- style_ref_wav_text_language=ref_language,
474
- )
475
-
476
- # 保存生成的音频
477
- save_audio(gen_audio, output_path=output_path)
478
-
479
- return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  # 创建Gradio界面
482
  with gr.Blocks(title="VEVO Demo") as demo:
 
316
  temp_reference_path = "wav/temp_reference.wav"
317
  output_path = "wav/output_vevotimbre.wav"
318
 
319
+ # 检查并处理音频数据
320
  if content_wav is None or reference_wav is None:
321
  raise ValueError("请上传音频文件")
322
 
323
+ # 处理内容音频格式
324
  if isinstance(content_wav, tuple) and len(content_wav) == 2:
 
325
  if isinstance(content_wav[0], np.ndarray):
326
  content_data, content_sr = content_wav
327
  else:
328
  content_sr, content_data = content_wav
329
+
330
+ # 确保是单声道
331
+ if len(content_data.shape) > 1 and content_data.shape[1] > 1:
332
+ content_data = np.mean(content_data, axis=1)
333
+
334
+ # 重采样到24kHz
335
+ if content_sr != 24000:
336
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
337
+ content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
338
+ content_sr = 24000
339
+ else:
340
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
341
+
342
+ # 归一化音量
343
+ content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
344
  else:
345
  raise ValueError("内容音频格式不正确")
346
+
347
+ # 处理参考音频格式
348
  if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
 
349
  if isinstance(reference_wav[0], np.ndarray):
350
  reference_data, reference_sr = reference_wav
351
  else:
352
  reference_sr, reference_data = reference_wav
353
+
354
+ # 确保是单声道
355
+ if len(reference_data.shape) > 1 and reference_data.shape[1] > 1:
356
+ reference_data = np.mean(reference_data, axis=1)
357
+
358
+ # 重采样到24kHz
359
+ if reference_sr != 24000:
360
+ reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
361
+ reference_tensor = torchaudio.functional.resample(reference_tensor, reference_sr, 24000)
362
+ reference_sr = 24000
363
+ else:
364
+ reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
365
+
366
+ # 归一化音量
367
+ reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
368
  else:
369
  raise ValueError("参考音频格式不正确")
370
 
371
+ # 打印debug信息
372
+ print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
373
+ print(f"参考音频形状: {reference_tensor.shape}, 采样率: {reference_sr}")
374
+
375
  # 保存上传的音频
376
  torchaudio.save(temp_content_path, content_tensor, content_sr)
377
  torchaudio.save(temp_reference_path, reference_tensor, reference_sr)
378
 
379
+ try:
380
+ # 获取管道
381
+ pipeline = get_pipeline("timbre")
382
+
383
+ # 推理
384
+ gen_audio = pipeline.inference_fm(
385
+ src_wav_path=temp_content_path,
386
+ timbre_ref_wav_path=temp_reference_path,
387
+ flow_matching_steps=32,
388
+ )
389
+
390
+ # 检查生成音频是否为数值异常
391
+ if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
392
+ print("警告:生成的音频包含NaN或Inf值")
393
+ gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
394
+
395
+ print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
396
+
397
+ # 保存生成的音频
398
+ save_audio(gen_audio, output_path=output_path)
399
+
400
+ return output_path
401
+ except Exception as e:
402
+ print(f"处理过程中出错: {e}")
403
+ import traceback
404
+ traceback.print_exc()
405
+ raise e
406
 
407
def vevo_voice(content_wav, reference_wav):
    """Voice conversion: re-speak ``content_wav`` using the style and timbre
    of ``reference_wav``.

    Both inputs are Gradio audio values: a 2-tuple that is either
    ``(np.ndarray, sample_rate)`` or ``(sample_rate, np.ndarray)``.

    Returns:
        Path of the generated wav file.

    Raises:
        ValueError: if an input is missing or is not a recognizable 2-tuple.
    """
    temp_content_path = "wav/temp_content.wav"
    temp_reference_path = "wav/temp_reference.wav"
    output_path = "wav/output_vevovoice.wav"

    if content_wav is None or reference_wav is None:
        raise ValueError("请上传音频文件")

    def _prepare(wav):
        # Normalize one Gradio audio tuple into a (1, T) float tensor at
        # 24 kHz, peak-normalized to 0.95. Returns (tensor, sample_rate).
        if isinstance(wav[0], np.ndarray):
            data, sr = wav  # (data, sample_rate) order
        else:
            sr, data = wav  # (sample_rate, data) order
        # Downmix multi-channel audio to mono — assumes (samples, channels)
        # layout as produced by Gradio; TODO confirm against the caller.
        if len(data.shape) > 1 and data.shape[1] > 1:
            data = np.mean(data, axis=1)
        tensor = torch.FloatTensor(data).unsqueeze(0)
        # Resample to the model's expected 24 kHz rate.
        if sr != 24000:
            tensor = torchaudio.functional.resample(tensor, sr, 24000)
            sr = 24000
        # Peak-normalize; the epsilon guards against an all-silent clip.
        tensor = tensor / (torch.max(torch.abs(tensor)) + 1e-6) * 0.95
        return tensor, sr

    if isinstance(content_wav, tuple) and len(content_wav) == 2:
        content_tensor, content_sr = _prepare(content_wav)
    else:
        raise ValueError("内容音频格式不正确")

    if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
        reference_tensor, reference_sr = _prepare(reference_wav)
    else:
        raise ValueError("参考音频格式不正确")

    # Debug info.
    print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
    print(f"参考音频形状: {reference_tensor.shape}, 采样率: {reference_sr}")

    # Persist the uploads so the pipeline can read them from disk.
    torchaudio.save(temp_content_path, content_tensor, content_sr)
    torchaudio.save(temp_reference_path, reference_tensor, reference_sr)

    try:
        pipeline = get_pipeline("voice")

        # The same reference drives both style and timbre here.
        gen_audio = pipeline.inference_ar_and_fm(
            src_wav_path=temp_content_path,
            src_text=None,
            style_ref_wav_path=temp_reference_path,
            timbre_ref_wav_path=temp_reference_path,
        )

        # Guard against numerically broken output before saving.
        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
            print("警告:生成的音频包含NaN或Inf值")
            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)

        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")

        save_audio(gen_audio, output_path=output_path)
        return output_path
    except Exception as e:
        print(f"处理过程中出错: {e}")
        import traceback
        traceback.print_exc()
        raise  # bare re-raise preserves the original traceback (was: raise e)
500
 
501
def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
    """Text-to-speech: synthesize ``text`` in the style of ``ref_wav``.

    Args:
        text: Source text to synthesize.
        ref_wav: Style-reference audio, a Gradio 2-tuple that is either
            ``(np.ndarray, sample_rate)`` or ``(sample_rate, np.ndarray)``.
        timbre_ref_wav: Optional separate timbre reference in the same
            format; when omitted, ``ref_wav`` is reused for timbre.
        src_language: Language code of ``text``.
        ref_language: Language code of the reference speech.

    Returns:
        Path of the generated wav file.

    Raises:
        ValueError: if a required audio input is missing or malformed.
    """
    temp_ref_path = "wav/temp_ref.wav"
    temp_timbre_path = "wav/temp_timbre.wav"
    output_path = "wav/output_vevotts.wav"

    if ref_wav is None:
        raise ValueError("请上传参考音频文件")

    def _prepare(wav):
        # Normalize one Gradio audio tuple into a (1, T) float tensor at
        # 24 kHz, peak-normalized to 0.95. Returns (tensor, sample_rate).
        if isinstance(wav[0], np.ndarray):
            data, sr = wav  # (data, sample_rate) order
        else:
            sr, data = wav  # (sample_rate, data) order
        # Downmix multi-channel audio to mono — assumes (samples, channels)
        # layout as produced by Gradio; TODO confirm against the caller.
        if len(data.shape) > 1 and data.shape[1] > 1:
            data = np.mean(data, axis=1)
        tensor = torch.FloatTensor(data).unsqueeze(0)
        # Resample to the model's expected 24 kHz rate.
        if sr != 24000:
            tensor = torchaudio.functional.resample(tensor, sr, 24000)
            sr = 24000
        # Peak-normalize; the epsilon guards against an all-silent clip.
        tensor = tensor / (torch.max(torch.abs(tensor)) + 1e-6) * 0.95
        return tensor, sr

    if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
        ref_tensor, ref_sr = _prepare(ref_wav)
    else:
        raise ValueError("参考音频格式不正确")

    # Debug info.
    print(f"参考音频形状: {ref_tensor.shape}, 采样率: {ref_sr}")

    # Persist the style reference so the pipeline can read it from disk.
    torchaudio.save(temp_ref_path, ref_tensor, ref_sr)

    if timbre_ref_wav is not None:
        if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
            timbre_tensor, timbre_sr = _prepare(timbre_ref_wav)
            print(f"音色参考音频形状: {timbre_tensor.shape}, 采样率: {timbre_sr}")
            torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
        else:
            raise ValueError("音色参考音频格式不正确")
    else:
        # No dedicated timbre reference: reuse the style reference file.
        temp_timbre_path = temp_ref_path

    try:
        pipeline = get_pipeline("tts")

        gen_audio = pipeline.inference_ar_and_fm(
            src_wav_path=None,
            src_text=text,
            style_ref_wav_path=temp_ref_path,
            timbre_ref_wav_path=temp_timbre_path,
            style_ref_wav_text=None,
            src_text_language=src_language,
            style_ref_wav_text_language=ref_language,
        )

        # Guard against numerically broken output before saving.
        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
            print("警告:生成的音频包含NaN或Inf值")
            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)

        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")

        save_audio(gen_audio, output_path=output_path)
        return output_path
    except Exception as e:
        print(f"处理过程中出错: {e}")
        import traceback
        traceback.print_exc()
        raise  # bare re-raise preserves the original traceback (was: raise e)
600
 
601
  # 创建Gradio界面
602
  with gr.Blocks(title="VEVO Demo") as demo: