mjbuehler committed
Commit 52250e2 · verified · 1 parent: b04813d

Update app.py


Updated edit, a few new templates, etc.
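For orientation, the sketch below mirrors how the new `do_web_search` option introduced in this commit is threaded into promptic's `@llm` decorator via the `conditional_llm` factory shown in the diff. It is an illustrative sketch only: the `summarize` function, the model name, and the API key are hypothetical placeholders, not part of the commit.

```python
from promptic import llm


def conditional_llm(model, api_base=None, api_key=None,
                    reasoning_effort="N/A", do_web_search=False):
    """Build promptic's @llm decorator with only the kwargs that apply
    (mirrors the conditional_llm factory added in this commit)."""
    decorator_kwargs = {"model": model}
    if api_base:
        decorator_kwargs["api_base"] = api_base   # custom or local endpoint
    else:
        decorator_kwargs["api_key"] = api_key     # hosted endpoint
    if reasoning_effort != "N/A":
        decorator_kwargs["reasoning_effort"] = reasoning_effort
    if do_web_search:
        decorator_kwargs["web_search_options"] = {}  # let the model call web search

    def decorator(func):
        return llm(**decorator_kwargs)(func)

    return decorator


# Hypothetical usage, analogous to how generate_dialogue is decorated in app.py;
# the model name, key, and prompt are placeholders.
@conditional_llm(model="o4-mini", api_key="sk-...", do_web_search=True)
def summarize(text: str) -> str:
    """Summarize the following text in three sentences: {text}"""
```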

Files changed (1)
  1. app.py +326 -54
app.py CHANGED
@@ -33,7 +33,7 @@ def read_readme():
 INSTRUCTION_TEMPLATES = {
 
 ################# DEEP DATA ANALYSIS ##################
- "Deep research analysis": {
+ "deep research analysis": {
 # 1) High‑level task description
 "intro": """You are a senior analyst who conducts deep research.
 
@@ -69,7 +69,9 @@ When ready, compile the final report strictly following the template above.""",
 "prelude": """Below is the structured report based on the supplied raw data:""",
 
 # 5) Main output instructions
- "dialog": """Design your output to be read aloud -- it will be directly converted into audio. The presentation of materials should include 30,000 words.
+ "dialog": """Design your output to be read aloud -- it will be directly converted into audio. The presentation of materials should include 30,000 words.
+
+ If you have equations, variables or other complex concepts, make sure to design your output so that it can be clearly rendered by a text-to-voice model.
 
 There is only one speaker, you. Stay on topic and maintaining an engaging flow.
 
@@ -77,6 +79,48 @@ Write a clear, detailed, and well-prepared analysis and report as a single narra
 },
 
 
+ ################# CLEAN READ‑THROUGH ##################
+ "clean rendering": {
+ # 1) What the model should do
+ "intro": """You are a careful narrator tasked with producing an **accurate, faithful rendering** of the supplied document so it can be read aloud.
+
+ Your priorities are:
+ • Preserve the original wording and ordering of the content.
+ • Remove anything that is clearly an artefact of page layout (page numbers, running headers/footers, line numbers, PDF crop marks, hyphen‑splits at line wraps).
+ • Keep mathematical symbols, equations and variable names intact, but read them in a way a TTS system can pronounce (e.g. “square root of”, “alpha sub i”).
+ • Do **not** add commentary, summaries, or extra explanations—just the cleaned text.
+ • Present everything in the **same sequence** as in the source.
+
+ Output must be suitable for text‑to‑speech; begin every paragraph with `speaker-1:` and write as a single narrator.""",
+
+ # 2) How to cleanse the raw text
+ "text_instructions": """Scan the input for artefacts such as:
+
+ - Stand‑alone page numbers or headers like “Page 12 of 30”
+ - Repeated footers, URLs or timestamps
+ - Manual hyphenation at line breaks (join split words)
+ - Broken tables or columns (flatten them into continuous sentences where possible)
+
+ Strip these while keeping all legitimate content. Do **not** reorder paragraphs or sentences.""",
+
+ # 3) Hidden scratch‑pad for the model
+ "scratch_pad": """Brainstorm here (hidden):
+ - Identify obvious header/footer patterns to delete.
+ - Decide how to handle any malformed tables (e.g. read row‑by‑row).
+ - Note any equations that need a spoken equivalent.
+ After cleaning decisions are made, move on to generate the final narration.""",
+
+ # 4) Prelude before the narration starts
+ "prelude": """Below is the faithful narration of the provided document (cleaned of layout artefacts, otherwise unchanged):""",
+
+ # 5) Main output instructions
+ "dialog": """Design your output to be read aloud—no markup, no bracketed directions.
+ Only one speaker (`speaker-1:`).
+ Maintain original headings and paragraph breaks where they naturally occur in the source.
+ If an equation appears, read it in a TTS‑friendly style (e.g. `speaker-1: E equals m times c squared`)."""
+ },
+
+
 ################# PODCAST ##################
 "podcast": {
 "intro": """Your task is to take the input text provided and turn it into an lively, engaging, informative podcast dialogue, in the style of NPR. Do not use or make up names. The input text may be messy or unstructured, as it could come from a variety of sources like PDFs or web pages.
@@ -472,33 +516,6 @@ O podcast deve ter cerca de 20.000 palavras.
 },
 }
 
- # Function to update instruction fields based on template selection
- def update_instructions(template):
- return (
- INSTRUCTION_TEMPLATES[template]["intro"],
- INSTRUCTION_TEMPLATES[template]["text_instructions"],
- INSTRUCTION_TEMPLATES[template]["scratch_pad"],
- INSTRUCTION_TEMPLATES[template]["prelude"],
- INSTRUCTION_TEMPLATES[template]["dialog"]
- )
-
- import concurrent.futures as cf
- import glob
- import io
- import os
- import time
- from pathlib import Path
- from tempfile import NamedTemporaryFile
- from typing import List, Literal
-
- import gradio as gr
-
- from loguru import logger
- from openai import OpenAI
- from promptic import llm
- from pydantic import BaseModel, ValidationError
- from pypdf import PdfReader
- from tenacity import retry, retry_if_exception_type
 
 # Define standard values
 STANDARD_TEXT_MODELS = [
@@ -552,6 +569,35 @@ STANDARD_VOICES = [
 
 ]
 
+ # Function to update instruction fields based on template selection
+ def update_instructions(template):
+ return (
+ INSTRUCTION_TEMPLATES[template]["intro"],
+ INSTRUCTION_TEMPLATES[template]["text_instructions"],
+ INSTRUCTION_TEMPLATES[template]["scratch_pad"],
+ INSTRUCTION_TEMPLATES[template]["prelude"],
+ INSTRUCTION_TEMPLATES[template]["dialog"]
+ )
+
+ import concurrent.futures as cf
+ import glob
+ import io
+ import os
+ import time
+ from pathlib import Path
+ from tempfile import NamedTemporaryFile
+ from typing import List, Literal
+
+ import gradio as gr
+
+ from loguru import logger
+ from openai import OpenAI
+ from promptic import llm
+ from pydantic import BaseModel, ValidationError
+ from pypdf import PdfReader
+ from tenacity import retry, retry_if_exception_type
+
+
 class DialogueItem(BaseModel):
 text: str
 speaker: Literal["speaker-1", "speaker-2"]
@@ -577,11 +623,11 @@ def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> byt
 '''
 def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None,
 speaker_instructions: str ='Speak in an emotive and friendly tone.') -> bytes:
+
 client = OpenAI(
 api_key=api_key or os.getenv("OPENAI_API_KEY"),
 )
-
-
+
 with client.audio.speech.with_streaming_response.create(
 model=audio_model,
 voice=voice,
@@ -596,7 +642,7 @@ def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None,
 
 
 from functools import wraps
-
+ '''
 def conditional_llm(model, api_base=None, api_key=None, reasoning_effort="N/A"):
 """
 Conditionally apply the @llm decorator based on the api_base parameter.
@@ -614,12 +660,44 @@ def conditional_llm(model, api_base=None, api_key=None, reasoning_effort="N/A"):
 return llm(model=model, api_key=api_key, reasoning_effort=reasoning_effort)(func)
 
 return decorator
+ '''
+ def conditional_llm(
+ model,
+ api_base=None,
+ api_key=None,
+ reasoning_effort="N/A",
+ do_web_search=False,
+ ):
+ """
+ Wrap a function with the @llm decorator, choosing kwargs dynamically.
+ Adds `web_search_options={}` when do_web_search==True.
+ """
+
+ # build decorator kwargs once so we don’t repeat logic
+ decorator_kwargs = {"model": model}
+
+ if api_base:
+ decorator_kwargs["api_base"] = api_base
+ else:
+ decorator_kwargs["api_key"] = api_key
+ if reasoning_effort != "N/A":
+ decorator_kwargs["reasoning_effort"] = reasoning_effort
+
+ if do_web_search:
+ decorator_kwargs["web_search_options"] = {} # empty dict → default behaviour
+
+ def decorator(func):
+ return llm(**decorator_kwargs)(func)
+
+ return decorator
+
 
 def generate_audio(
 files: list,
 openai_api_key: str = None,
 text_model: str = "o4-mini", #o1-2024-12-17", #"o1-preview-2024-09-12",
 reasoning_effort: str = "N/A",
+ do_web_search: bool = False,
 audio_model: str = "tts-1",
 speaker_1_voice: str = "alloy",
 speaker_2_voice: str = "echo",
@@ -636,6 +714,8 @@ def generate_audio(
 original_text: str = None,
 debug = False,
 ) -> tuple:
+
+
 # Validate API Key
 if not os.getenv("OPENAI_API_KEY") and not openai_api_key:
 raise gr.Error("OpenAI API key is required")
@@ -643,14 +723,7 @@ def generate_audio(
 combined_text = original_text or ""
 
 # If there's no original text, extract it from the uploaded files
- '''
- if not combined_text:
- for file in files:
- with Path(file).open("rb") as f:
- reader = PdfReader(f)
- text = "\n\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
- combined_text += text + "\n\n"
- '''
+
 
 if not combined_text:
 for file in files:
@@ -670,7 +743,14 @@ def generate_audio(
 combined_text += text + "\n\n"
 # Configure the LLM based on selected model and api_base
 @retry(retry=retry_if_exception_type(ValidationError))
- @conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
+ #@conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
+ @conditional_llm(
+ model=text_model,
+ api_base=api_base,
+ api_key=openai_api_key,
+ reasoning_effort=reasoning_effort,
+ do_web_search=do_web_search,
+ )
 def generate_dialogue(text: str, intro_instructions: str, text_instructions: str, scratch_pad_instructions: str,
 prelude_dialog: str, podcast_dialog_instructions: str,
 edited_transcript: str = None, user_feedback: str = None, ) -> Dialogue:
@@ -749,6 +829,7 @@ def generate_audio(
 temporary_file = NamedTemporaryFile(
 dir=temporary_directory,
 delete=False,
+ prefix="PDF2Audio_",
 suffix=".mp3",
 )
 temporary_file.write(audio)
@@ -759,18 +840,21 @@ def generate_audio(
 if os.path.isfile(file) and time.time() - os.path.getmtime(file) > 24 * 60 * 60:
 os.remove(file)
 
- return temporary_file.name, transcript, combined_text
+ return temporary_file.name, transcript, combined_text, llm_output
 
 def validate_and_generate_audio(*args):
 files = args[0]
 if not files:
 return None, None, None, "Please upload at least one PDF (or MD/MMD/TXT) file before generating audio."
 try:
- audio_file, transcript, original_text = generate_audio(*args)
- return audio_file, transcript, original_text, None # Return None as the error when successful
+ #audio_file, transcript, original_text = generate_audio(*args)
+ audio_file, transcript, original_text, dialogue = generate_audio(*args)
+ return audio_file, transcript, original_text, None, dialogue #
 except Exception as e:
- # If an error occurs during generation, return None for the outputs and the error message
- return None, None, None, str(e)
+ return None, None, None, str(e), None #
+
+
+
 
 def edit_and_regenerate(edited_transcript, user_feedback, *args):
 # Replace the original transcript and feedback in the args with the new ones
@@ -786,6 +870,128 @@ def process_feedback_and_regenerate(feedback, *args):
 new_args.append(feedback) # Add user feedback as a new argument
 return validate_and_generate_audio(*new_args)
 
+
+ ####################################################
+ #Download dialog/result as markdown
+ ####################################################
+
+ def dialogue_to_markdown(dlg: Dialogue) -> str:
+ lines = []
+ lines.append("# PDF2Audio Transcript\n")
+ lines.append("## Transcript\n")
+ for item in dlg.dialogue:
+ lines.append(f"**{item.speaker}:** {item.text.strip()}\n")
+ return "\n".join(lines)
+
+ def save_dialogue_as_markdown(cached_dialogue) -> str:
+ if cached_dialogue is None:
+ raise gr.Error("No dialogue to save. Please generate or edit a dialogue first.")
+
+ markdown_text = dialogue_to_markdown(cached_dialogue)
+
+ # Write to a temporary .md file
+ temp_dir = "./gradio_cached_examples/tmp/"
+ os.makedirs(temp_dir, exist_ok=True)
+
+ file_path = os.path.join(temp_dir, f"PDF2Audio_dialogue_{int(time.time())}.md")
+ with open(file_path, "w", encoding="utf-8") as f:
+ f.write(markdown_text)
+
+ return file_path
+
+
+
+ ####################################################
+ #Edit and re-render audio from existing LLM output
+ ####################################################
+
+ import pandas as pd
+ from typing import List
+
+ def dialogue_to_df(dlg: Dialogue) -> pd.DataFrame:
+ data = [{"Speaker": item.speaker, "Line": item.text} for item in dlg.dialogue]
+ return pd.DataFrame(data)
+
+ def df_to_dialogue(df: pd.DataFrame, scratchpad: str = "") -> Dialogue:
+ items: List[DialogueItem] = [
+ DialogueItem(speaker=row["Speaker"], text=row["Line"])
+ for _, row in df.iterrows()
+ ]
+ return Dialogue(scratchpad=scratchpad, dialogue=items)
+
+ def save_dialogue_edits(df, cached_dialogue):
+ """
+ Save the edited dialogue and update the per-session cached state.
+ """
+ if cached_dialogue is None:
+ raise gr.Error("Nothing to edit yet – run Generate Audio first.")
+
+ import pandas as pd
+ new_dlg = df_to_dialogue(pd.DataFrame(df, columns=["Speaker", "Line"]))
+
+ # regenerate plain transcript so the user sees the change immediately
+ transcript_str = "\n".join(f"{d.speaker}: {d.text}" for d in new_dlg.dialogue)
+
+ # Return updated state and transcript
+ return new_dlg, gr.update(value=transcript_str), "Edits saved. Press *Re‑render* to hear them."
+
+
+ def render_audio_from_dialogue(
+ cached_dialogue, # 👈 NEW: pass in as argument
+ openai_api_key: str,
+ audio_model: str,
+ speaker_1_voice: str,
+ speaker_2_voice: str,
+ speaker_1_instructions: str,
+ speaker_2_instructions: str,
+ ) -> tuple[str, str]: # mp3 file path, transcript
+
+ if cached_dialogue is None:
+ raise gr.Error("Nothing to re‑render yet – run Generate Audio first.")
+
+ dlg = cached_dialogue
+ audio_bytes, transcript, characters = b"", "", 0
+
+ with cf.ThreadPoolExecutor() as ex:
+ futures = []
+ for item in dlg.dialogue:
+ voice = speaker_1_voice if item.speaker == "speaker-1" else speaker_2_voice
+ instr = speaker_1_instructions if item.speaker == "speaker-1" else speaker_2_instructions
+ futures.append(
+ (
+ ex.submit(get_mp3, item.text, voice, audio_model, openai_api_key, instr),
+ f"{item.speaker}: {item.text}",
+ )
+ )
+ characters += len(item.text)
+
+ for fut, line in futures:
+ audio_bytes += fut.result()
+ transcript += line + "\n\n"
+
+ logger.info(f"[Re‑render] {characters} characters voiced")
+
+ # Write to temporary .mp3 file
+ temporary_directory = "./gradio_cached_examples/tmp/"
+ os.makedirs(temporary_directory, exist_ok=True)
+
+ temporary_file = NamedTemporaryFile(
+ dir=temporary_directory,
+ delete=False,
+ prefix="PDF2Audio_",
+ suffix=".mp3",
+ )
+ temporary_file.write(audio_bytes)
+ temporary_file.close()
+
+ # Clean up old files
+ for file in glob.glob(f"{temporary_directory}*.mp3"):
+ if os.path.isfile(file) and time.time() - os.path.getmtime(file) > 24 * 60 * 60:
+ os.remove(file)
+
+ return temporary_file.name, transcript
+
+
 with gr.Blocks(title="PDF to Audio", css="""
 #header {
 display: flex;
@@ -815,6 +1021,8 @@ with gr.Blocks(title="PDF to Audio", css="""
 margin-top: 20px;
 }
 """) as demo:
+
+ cached_dialogue = gr.State()
 
 with gr.Row(elem_id="header"):
 with gr.Column(scale=4):
@@ -889,6 +1097,12 @@ with gr.Blocks(title="PDF to Audio", css="""
 info="If you are using a custom or local model, provide the API base URL here, e.g.: http://localhost:8080/v1 for llama.cpp REST server.",
 )
 
+ do_web_search = gr.Checkbox(
+ label="Let the LLM search the web to complement the documents.",
+ value=False,
+ info="When enabled, the LLM will call the web search tool during its reasoning."
+ )
+
 with gr.Column(scale=3):
 template_dropdown = gr.Dropdown(
 label="Instruction Template",
@@ -929,7 +1143,7 @@ with gr.Blocks(title="PDF to Audio", css="""
 )
 
 audio_output = gr.Audio(label="Audio", format="mp3", interactive=False, autoplay=False)
- transcript_output = gr.Textbox(label="Transcript", lines=20, show_copy_button=True)
+ transcript_output = gr.Textbox(label="Transcript", lines=25, show_copy_button=True)
 original_text_output = gr.Textbox(label="Original Text", lines=10, visible=False)
 error_output = gr.Textbox(visible=False) # Hidden textbox to store error message
 
@@ -940,6 +1154,45 @@ with gr.Blocks(title="PDF to Audio", css="""
 user_feedback = gr.Textbox(label="Provide Feedback or Notes", lines=10, #placeholder="Enter your feedback or notes here..."
 )
 regenerate_btn = gr.Button("Regenerate Audio with Edits and Feedback")
+
+ with gr.Accordion("Edit dialogue line‑by‑line", open=False) as editor_box:
+ df_editor = gr.Dataframe(
+ headers=["Speaker", "Line"],
+ datatype=["str", "str"],
+ wrap=True,
+ interactive=True,
+ row_count=(1, "dynamic"),
+ col_count=(2, "fixed"),
+ )
+
+ save_btn = gr.Button("Save edits")
+ save_msg = gr.Markdown()
+
+
+ save_btn.click(
+ fn=save_dialogue_edits,
+ inputs=[df_editor, cached_dialogue],
+ outputs=[cached_dialogue, transcript_output, save_msg],
+ )
+
+ rerender_btn = gr.Button("Re‑render with current voice settings (must have generated original LLM output)")
+
+ rerender_btn.click(
+ fn=render_audio_from_dialogue,
+ inputs=[
+ cached_dialogue,
+ openai_api_key,
+ audio_model,
+ speaker_1_voice,
+ speaker_2_voice,
+ speaker_1_instructions,
+ speaker_2_instructions,
+ ],
+ outputs=[audio_output, transcript_output],
+ )
+
+
+
 # Function to update the interactive state of edited_transcript
 def update_edit_box(checkbox_value):
 return gr.update(interactive=checkbox_value, lines=20 if checkbox_value else 20, visible=True if checkbox_value else False)
@@ -960,15 +1213,16 @@ with gr.Blocks(title="PDF to Audio", css="""
 submit_btn.click(
 fn=validate_and_generate_audio,
 inputs=[
- files, openai_api_key, text_model, reasoning_effort, audio_model,
+ files, openai_api_key, text_model, reasoning_effort, do_web_search, audio_model,
 speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
 api_base,
 intro_instructions, text_instructions, scratch_pad_instructions,
 prelude_dialog, podcast_dialog_instructions,
- edited_transcript, # placeholder for edited_transcript
- user_feedback, # placeholder for user_feedback
+ edited_transcript,
+ user_feedback,
+
 ],
- outputs=[audio_output, transcript_output, original_text_output, error_output]
+ outputs=[audio_output, transcript_output, original_text_output, error_output, cached_dialogue, ]
 ).then(
 fn=lambda audio, transcript, original_text, error: (
 transcript if transcript else "",
@@ -980,7 +1234,11 @@ with gr.Blocks(title="PDF to Audio", css="""
 fn=lambda error: gr.Warning(error) if error else None,
 inputs=[error_output],
 outputs=[]
- )
+ ).then( # fill spreadsheet editor
+ fn=dialogue_to_df,
+ inputs=[cached_dialogue],
+ outputs=[df_editor],
+ )
 
 regenerate_btn.click(
 fn=lambda use_edit, edit, *args: validate_and_generate_audio(
@@ -990,14 +1248,14 @@ with gr.Blocks(title="PDF to Audio", css="""
 ),
 inputs=[
 use_edited_transcript, edited_transcript,
- files, openai_api_key, text_model, reasoning_effort, audio_model,
+ files, openai_api_key, text_model, reasoning_effort, do_web_search, audio_model,
 speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
 api_base,
 intro_instructions, text_instructions, scratch_pad_instructions,
 prelude_dialog, podcast_dialog_instructions,
 user_feedback, original_text_output
 ],
- outputs=[audio_output, transcript_output, original_text_output, error_output]
+ outputs=[audio_output, transcript_output, original_text_output, error_output, cached_dialogue, ]
 ).then(
 fn=lambda audio, transcript, original_text, error: (
 transcript if transcript else "",
@@ -1009,6 +1267,20 @@ with gr.Blocks(title="PDF to Audio", css="""
 fn=lambda error: gr.Warning(error) if error else None,
 inputs=[error_output],
 outputs=[]
+ ).then( # fill spreadsheet editor
+ fn=dialogue_to_df,
+ inputs=[cached_dialogue],
+ outputs=[df_editor],
+ )
+
+ with gr.Row():
+ save_md_btn = gr.Button("Download Markdown of Dialogue")
+ markdown_file_output = gr.File(label="Download .md file")
+
+ save_md_btn.click(
+ fn=save_dialogue_as_markdown,
+ inputs=[cached_dialogue],
+ outputs=[markdown_file_output],
 )
 
 # Add README content at the bottom