mjbuehler committed
Commit 9aef995 · verified · 1 Parent(s): 773f681

Update app.py


Updates for o4, tts via gpt-4o
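For context, "tts via gpt-4o" here refers to OpenAI's instruction-steerable speech model (gpt-4o-mini-tts in the diff below), which accepts a per-request instructions prompt that tts-1 / tts-1-hd do not. A minimal sketch of the call pattern the updated get_mp3 adopts, not part of the commit itself; it assumes the openai Python SDK v1+ and an OPENAI_API_KEY in the environment:

# Sketch only: the call shape app.py moves to for instruction-steered TTS.
import io
import os
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Stream synthesized speech into memory; `instructions` steers the delivery
# style on gpt-4o-mini-tts (older tts-1 models do not support it).
with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",
    voice="alloy",
    input="Hello and welcome to the show.",
    instructions="Speak in an emotive and friendly tone.",
) as response:
    with io.BytesIO() as buf:
        for chunk in response.iter_bytes():
            buf.write(chunk)
        mp3_bytes = buf.getvalue()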

Files changed (1)
app.py  +48 -13
app.py CHANGED
@@ -496,7 +496,7 @@ class DialogueItem(BaseModel):
 class Dialogue(BaseModel):
     scratchpad: str
     dialogue: List[DialogueItem]
-
+'''
 def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> bytes:
     client = OpenAI(
         api_key=api_key or os.getenv("OPENAI_API_KEY"),
@@ -511,6 +511,25 @@ def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> bytes:
         for chunk in response.iter_bytes():
             file.write(chunk)
         return file.getvalue()
+'''
+def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None,
+            speaker_instructions: str = 'Speak in an emotive and friendly tone.') -> bytes:
+    client = OpenAI(
+        api_key=api_key or os.getenv("OPENAI_API_KEY"),
+    )
+
+
+    with client.audio.speech.with_streaming_response.create(
+        model=audio_model,
+        voice=voice,
+        input=text,
+        instructions=speaker_instructions,
+    ) as response:
+        with io.BytesIO() as file:
+            for chunk in response.iter_bytes():
+                file.write(chunk)
+            return file.getvalue()
+
 
 
 from functools import wraps
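A quick way to smoke-test the reworked helper above (hypothetical snippet, not part of the commit; it assumes app.py is importable as a module named app and that a valid OPENAI_API_KEY is set, since api_key defaults to None):

from app import get_mp3  # hypothetical import; adjust to how the Space is laid out

# Synthesize a single dialogue line with a per-speaker delivery instruction.
mp3_bytes = get_mp3(
    "Welcome back to the podcast.",
    voice="echo",
    audio_model="gpt-4o-mini-tts",
    speaker_instructions="Speak in a friendly, but serious tone.",
)
with open("line.mp3", "wb") as f:
    f.write(mp3_bytes)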
@@ -531,10 +550,12 @@ def conditional_llm(model, api_base=None, api_key=None):
 def generate_audio(
     files: list,
     openai_api_key: str = None,
-    text_model: str = "o1-2024-12-17", #"o1-preview-2024-09-12",
+    text_model: str = "o4-mini", #"o1-2024-12-17", #"o1-preview-2024-09-12",
     audio_model: str = "tts-1",
     speaker_1_voice: str = "alloy",
     speaker_2_voice: str = "echo",
+    speaker_1_instructions: str = '',
+    speaker_2_instructions: str = '',
     api_base: str = None,
     intro_instructions: str = '',
     text_instructions: str = '',
@@ -578,8 +599,6 @@ def generate_audio(
         with file_path.open("r", encoding="utf-8") as f:
             text = f.read()
         combined_text += text + "\n\n"
-
-
     # Configure the LLM based on selected model and api_base
     @retry(retry=retry_if_exception_type(ValidationError))
     @conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
@@ -642,7 +661,8 @@ def generate_audio(
         for line in llm_output.dialogue:
             transcript_line = f"{line.speaker}: {line.text}"
             voice = speaker_1_voice if line.speaker == "speaker-1" else speaker_2_voice
-            future = executor.submit(get_mp3, line.text, voice, audio_model, openai_api_key)
+            speaker_instructions = speaker_1_instructions if line.speaker == "speaker-1" else speaker_2_instructions
+            future = executor.submit(get_mp3, line.text, voice, audio_model, openai_api_key, speaker_instructions)
             futures.append((future, transcript_line))
             characters += len(line.text)
 
@@ -675,7 +695,7 @@ def generate_audio(
 def validate_and_generate_audio(*args):
     files = args[0]
     if not files:
-        return None, None, None, "Please upload at least one PDF file before generating audio."
+        return None, None, None, "Please upload at least one PDF (or MD/MMD/TXT) file before generating audio."
     try:
         audio_file, transcript, original_text = generate_audio(*args)
         return audio_file, transcript, original_text, None  # Return None as the error when successful
@@ -741,7 +761,6 @@ with gr.Blocks(title="PDF to Audio", css="""
 
     with gr.Row(elem_id="main_container"):
         with gr.Column(scale=2):
-            #files = gr.Files(label="PDFs", file_types=["pdf"], )
             files = gr.Files(label="PDFs (.pdf), markdown (.md, .mmd), or text files (.txt)", file_types=[".pdf", ".PDF", ".md", ".mmd", ".txt"], )
 
             openai_api_key = gr.Textbox(
@@ -753,7 +772,7 @@ with gr.Blocks(title="PDF to Audio", css="""
             text_model = gr.Dropdown(
                 label="Text Generation Model",
                 choices=STANDARD_TEXT_MODELS,
-                value="o1-preview-2024-09-12", #"gpt-4o-mini",
+                value="o4-mini", #"o3-mini", #"o1-preview-2024-09-12", #"gpt-4o-mini",
                 info="Select the model to generate the dialogue text.",
             )
             audio_model = gr.Dropdown(
@@ -774,6 +793,20 @@ with gr.Blocks(title="PDF to Audio", css="""
                 value="echo",
                 info="Select the voice for Speaker 2.",
             )
+            speaker_1_instructions = gr.Textbox(
+                label="Speaker 1 instructions",
+                value="Speak in an emotive and friendly tone.",
+                info="Speaker 1 instructions (used with gpt-4o-mini-tts only)",
+                interactive=True,
+            )
+
+            speaker_2_instructions = gr.Textbox(
+                label="Speaker 2 instructions",
+                value="Speak in a friendly, but serious tone.",
+                info="Speaker 2 instructions (used with gpt-4o-mini-tts only)",
+                interactive=True,
+            )
+
             api_base = gr.Textbox(
                 label="Custom API Base",
                 placeholder="Enter custom API base URL if using a custom/local model...",
@@ -852,7 +885,8 @@ with gr.Blocks(title="PDF to Audio", css="""
        fn=validate_and_generate_audio,
        inputs=[
            files, openai_api_key, text_model, audio_model,
-           speaker_1_voice, speaker_2_voice, api_base,
+           speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
+           api_base,
            intro_instructions, text_instructions, scratch_pad_instructions,
            prelude_dialog, podcast_dialog_instructions,
            edited_transcript,  # placeholder for edited_transcript
@@ -881,7 +915,8 @@ with gr.Blocks(title="PDF to Audio", css="""
        inputs=[
            use_edited_transcript, edited_transcript,
            files, openai_api_key, text_model, audio_model,
-           speaker_1_voice, speaker_2_voice, api_base,
+           speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
+           api_base,
            intro_instructions, text_instructions, scratch_pad_instructions,
            prelude_dialog, podcast_dialog_instructions,
            user_feedback, original_text_output
@@ -908,7 +943,7 @@ with gr.Blocks(title="PDF to Audio", css="""
 #demo.queue(max_size=20, default_concurrency_limit=32)
 
 # Launch the Gradio app
-if __name__ == "__main__":
-    demo.launch(share=True)
+#if __name__ == "__main__":
+#    demo.launch(share=True)
 
-#demo.launch(server_name="127.0.0.1", server_port=7860)
+demo.launch()
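A wiring detail behind the two inputs= changes above: validate_and_generate_audio forwards its *args into generate_audio positionally, so the order of each Gradio inputs list must line up with the positional parameters its callback consumes. The alignment after this commit (illustrative, truncated):

# inputs=[files, openai_api_key, text_model, audio_model,
#         speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
#         api_base, intro_instructions, ...]
# def generate_audio(files, openai_api_key, text_model, audio_model,
#                    speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
#                    api_base, intro_instructions, ...)

This is why both event handlers insert the two new instruction fields between speaker_2_voice and api_base, mirroring where the new keyword arguments sit in the function signature.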
 