Update app.py
Updated edit, a few new templates, etc.

app.py CHANGED
@@ -33,7 +33,7 @@ def read_readme():
 INSTRUCTION_TEMPLATES = {
 
 ################# DEEP DATA ANALYSIS ##################
-"
+"deep research analysis": {
     # 1) High‑level task description
     "intro": """You are a senior analyst who conducts deep research.
 
@@ -69,7 +69,9 @@ When ready, compile the final report strictly following the template above.""",
     "prelude": """Below is the structured report based on the supplied raw data:""",
 
     # 5) Main output instructions
-    "dialog": """Design your output to be read aloud -- it will be directly converted into audio. The presentation of materials should include 30,000 words.
+    "dialog": """Design your output to be read aloud -- it will be directly converted into audio. The presentation of materials should include 30,000 words.
+
+If you have equations, variables or other complex concepts, make sure to design your output so that it can be clearly rendered by a text-to-voice model.
 
 There is only one speaker, you. Stay on topic and maintaining an engaging flow.
 
@@ -77,6 +79,48 @@ Write a clear, detailed, and well-prepared analysis and report as a single narra
 },
 
 
+################# CLEAN READ‑THROUGH ##################
+"clean rendering": {
+    # 1) What the model should do
+    "intro": """You are a careful narrator tasked with producing an **accurate, faithful rendering** of the supplied document so it can be read aloud.
+
+Your priorities are:
+• Preserve the original wording and ordering of the content.
+• Remove anything that is clearly an artefact of page layout (page numbers, running headers/footers, line numbers, PDF crop marks, hyphen‑splits at line wraps).
+• Keep mathematical symbols, equations and variable names intact, but read them in a way a TTS system can pronounce (e.g. “square root of”, “alpha sub i”).
+• Do **not** add commentary, summaries, or extra explanations—just the cleaned text.
+• Present everything in the **same sequence** as in the source.
+
+Output must be suitable for text‑to‑speech; begin every paragraph with `speaker-1:` and write as a single narrator.""",
+
+    # 2) How to cleanse the raw text
+    "text_instructions": """Scan the input for artefacts such as:
+
+- Stand‑alone page numbers or headers like “Page 12 of 30”
+- Repeated footers, URLs or timestamps
+- Manual hyphenation at line breaks (join split words)
+- Broken tables or columns (flatten them into continuous sentences where possible)
+
+Strip these while keeping all legitimate content. Do **not** reorder paragraphs or sentences.""",
+
+    # 3) Hidden scratch‑pad for the model
+    "scratch_pad": """Brainstorm here (hidden):
+- Identify obvious header/footer patterns to delete.
+- Decide how to handle any malformed tables (e.g. read row‑by‑row).
+- Note any equations that need a spoken equivalent.
+After cleaning decisions are made, move on to generate the final narration.""",
+
+    # 4) Prelude before the narration starts
+    "prelude": """Below is the faithful narration of the provided document (cleaned of layout artefacts, otherwise unchanged):""",
+
+    # 5) Main output instructions
+    "dialog": """Design your output to be read aloud—no markup, no bracketed directions.
+Only one speaker (`speaker-1:`).
+Maintain original headings and paragraph breaks where they naturally occur in the source.
+If an equation appears, read it in a TTS‑friendly style (e.g. `speaker-1: E equals m times c squared`)."""
+},
+
+
 ################# PODCAST ##################
 "podcast": {
     "intro": """Your task is to take the input text provided and turn it into an lively, engaging, informative podcast dialogue, in the style of NPR. Do not use or make up names. The input text may be messy or unstructured, as it could come from a variety of sources like PDFs or web pages.
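As a quick usage sketch for the new template above (illustrative, not part of the diff; only the dictionary and key names are taken from the code):

    template = INSTRUCTION_TEMPLATES["clean rendering"]
    print(sorted(template.keys()))   # ['dialog', 'intro', 'prelude', 'scratch_pad', 'text_instructions']
    print(template["intro"][:80])    # first characters of the narrator instructions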
@@ -472,33 +516,6 @@ O podcast deve ter cerca de 20.000 palavras.
 },
 }
 
-# Function to update instruction fields based on template selection
-def update_instructions(template):
-    return (
-        INSTRUCTION_TEMPLATES[template]["intro"],
-        INSTRUCTION_TEMPLATES[template]["text_instructions"],
-        INSTRUCTION_TEMPLATES[template]["scratch_pad"],
-        INSTRUCTION_TEMPLATES[template]["prelude"],
-        INSTRUCTION_TEMPLATES[template]["dialog"]
-    )
-
-import concurrent.futures as cf
-import glob
-import io
-import os
-import time
-from pathlib import Path
-from tempfile import NamedTemporaryFile
-from typing import List, Literal
-
-import gradio as gr
-
-from loguru import logger
-from openai import OpenAI
-from promptic import llm
-from pydantic import BaseModel, ValidationError
-from pypdf import PdfReader
-from tenacity import retry, retry_if_exception_type
 
 # Define standard values
 STANDARD_TEXT_MODELS = [
@@ -552,6 +569,35 @@ STANDARD_VOICES = [
 
 ]
 
+# Function to update instruction fields based on template selection
+def update_instructions(template):
+    return (
+        INSTRUCTION_TEMPLATES[template]["intro"],
+        INSTRUCTION_TEMPLATES[template]["text_instructions"],
+        INSTRUCTION_TEMPLATES[template]["scratch_pad"],
+        INSTRUCTION_TEMPLATES[template]["prelude"],
+        INSTRUCTION_TEMPLATES[template]["dialog"]
+    )
+
+import concurrent.futures as cf
+import glob
+import io
+import os
+import time
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import List, Literal
+
+import gradio as gr
+
+from loguru import logger
+from openai import OpenAI
+from promptic import llm
+from pydantic import BaseModel, ValidationError
+from pypdf import PdfReader
+from tenacity import retry, retry_if_exception_type
+
+
 class DialogueItem(BaseModel):
     text: str
     speaker: Literal["speaker-1", "speaker-2"]
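A minimal sketch of how update_instructions is typically wired into the UI (the component names are the ones used further down in this diff; the .change() call itself is assumed rather than shown in this hunk):

    template_dropdown.change(
        fn=update_instructions,
        inputs=[template_dropdown],
        outputs=[intro_instructions, text_instructions, scratch_pad_instructions,
                 prelude_dialog, podcast_dialog_instructions],
    )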
@@ -577,11 +623,11 @@ def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None) -> byt
 '''
 def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None,
             speaker_instructions: str ='Speak in an emotive and friendly tone.') -> bytes:
+
     client = OpenAI(
         api_key=api_key or os.getenv("OPENAI_API_KEY"),
     )
-
-
+
     with client.audio.speech.with_streaming_response.create(
         model=audio_model,
         voice=voice,
@@ -596,7 +642,7 @@ def get_mp3(text: str, voice: str, audio_model: str, api_key: str = None,
 
 
 from functools import wraps
-
+'''
 def conditional_llm(model, api_base=None, api_key=None, reasoning_effort="N/A"):
     """
     Conditionally apply the @llm decorator based on the api_base parameter.
@@ -614,12 +660,44 @@ def conditional_llm(model, api_base=None, api_key=None, reasoning_effort="N/A"):
         return llm(model=model, api_key=api_key, reasoning_effort=reasoning_effort)(func)
 
     return decorator
+'''
+def conditional_llm(
+    model,
+    api_base=None,
+    api_key=None,
+    reasoning_effort="N/A",
+    do_web_search=False,
+):
+    """
+    Wrap a function with the @llm decorator, choosing kwargs dynamically.
+    Adds `web_search_options={}` when do_web_search==True.
+    """
+
+    # build decorator kwargs once so we don’t repeat logic
+    decorator_kwargs = {"model": model}
+
+    if api_base:
+        decorator_kwargs["api_base"] = api_base
+    else:
+        decorator_kwargs["api_key"] = api_key
+    if reasoning_effort != "N/A":
+        decorator_kwargs["reasoning_effort"] = reasoning_effort
+
+    if do_web_search:
+        decorator_kwargs["web_search_options"] = {}  # empty dict → default behaviour
+
+    def decorator(func):
+        return llm(**decorator_kwargs)(func)
+
+    return decorator
+
 
 def generate_audio(
     files: list,
     openai_api_key: str = None,
     text_model: str = "o4-mini", #o1-2024-12-17", #"o1-preview-2024-09-12",
     reasoning_effort: str = "N/A",
+    do_web_search: bool = False,
     audio_model: str = "tts-1",
     speaker_1_voice: str = "alloy",
     speaker_2_voice: str = "echo",
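A hypothetical sketch of how the rewritten conditional_llm factory would be applied, assuming promptic's usual docstring-as-prompt pattern; the model name and the outline function are made up for illustration:

    @conditional_llm(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"), do_web_search=True)
    def outline(text: str) -> str:
        """Produce a short outline of the following text: {text}"""

    # With these arguments the factory resolves to llm(model="gpt-4o-mini", api_key=..., web_search_options={}).
    # Passing api_base instead would route the call to a local server and drop the api_key.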
@@ -636,6 +714,8 @@ def generate_audio(
     original_text: str = None,
     debug = False,
 ) -> tuple:
+
+
     # Validate API Key
     if not os.getenv("OPENAI_API_KEY") and not openai_api_key:
         raise gr.Error("OpenAI API key is required")
@@ -643,14 +723,7 @@
     combined_text = original_text or ""
 
     # If there's no original text, extract it from the uploaded files
-
-    if not combined_text:
-        for file in files:
-            with Path(file).open("rb") as f:
-                reader = PdfReader(f)
-                text = "\n\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
-            combined_text += text + "\n\n"
-    '''
+
 
     if not combined_text:
         for file in files:
@@ -670,7 +743,14 @@ def generate_audio(
             combined_text += text + "\n\n"
     # Configure the LLM based on selected model and api_base
     @retry(retry=retry_if_exception_type(ValidationError))
-    @conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
+    #@conditional_llm(model=text_model, api_base=api_base, api_key=openai_api_key)
+    @conditional_llm(
+        model=text_model,
+        api_base=api_base,
+        api_key=openai_api_key,
+        reasoning_effort=reasoning_effort,
+        do_web_search=do_web_search,
+    )
     def generate_dialogue(text: str, intro_instructions: str, text_instructions: str, scratch_pad_instructions: str,
                           prelude_dialog: str, podcast_dialog_instructions: str,
                           edited_transcript: str = None, user_feedback: str = None, ) -> Dialogue:
@@ -749,6 +829,7 @@ def generate_audio(
     temporary_file = NamedTemporaryFile(
         dir=temporary_directory,
         delete=False,
+        prefix="PDF2Audio_",
         suffix=".mp3",
     )
     temporary_file.write(audio)
@@ -759,18 +840,21 @@
         if os.path.isfile(file) and time.time() - os.path.getmtime(file) > 24 * 60 * 60:
             os.remove(file)
 
-    return temporary_file.name, transcript, combined_text
+    return temporary_file.name, transcript, combined_text, llm_output
 
 def validate_and_generate_audio(*args):
     files = args[0]
     if not files:
         return None, None, None, "Please upload at least one PDF (or MD/MMD/TXT) file before generating audio."
     try:
-        audio_file, transcript, original_text = generate_audio(*args)
-
+        #audio_file, transcript, original_text = generate_audio(*args)
+        audio_file, transcript, original_text, dialogue = generate_audio(*args)
+        return audio_file, transcript, original_text, None, dialogue #
     except Exception as e:
-
-
+        return None, None, None, str(e), None #
+
+
+
 
 def edit_and_regenerate(edited_transcript, user_feedback, *args):
     # Replace the original transcript and feedback in the args with the new ones
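A small sketch of the changed calling contract (names illustrative, not part of the diff): generate_audio now ends its tuple with the raw LLM dialogue, and the success and error paths of validate_and_generate_audio return five values so the extra state output can be filled; note that the early "please upload" return above still yields four values.

    audio_file, transcript, original_text, error, dialogue = validate_and_generate_audio(*args)
    if error:
        print("generation failed:", error)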
@@ -786,6 +870,128 @@ def process_feedback_and_regenerate(feedback, *args):
     new_args.append(feedback) # Add user feedback as a new argument
     return validate_and_generate_audio(*new_args)
 
+
+####################################################
+#Download dialog/result as markdown
+####################################################
+
+def dialogue_to_markdown(dlg: Dialogue) -> str:
+    lines = []
+    lines.append("# PDF2Audio Transcript\n")
+    lines.append("## Transcript\n")
+    for item in dlg.dialogue:
+        lines.append(f"**{item.speaker}:** {item.text.strip()}\n")
+    return "\n".join(lines)
+
+def save_dialogue_as_markdown(cached_dialogue) -> str:
+    if cached_dialogue is None:
+        raise gr.Error("No dialogue to save. Please generate or edit a dialogue first.")
+
+    markdown_text = dialogue_to_markdown(cached_dialogue)
+
+    # Write to a temporary .md file
+    temp_dir = "./gradio_cached_examples/tmp/"
+    os.makedirs(temp_dir, exist_ok=True)
+
+    file_path = os.path.join(temp_dir, f"PDF2Audio_dialogue_{int(time.time())}.md")
+    with open(file_path, "w", encoding="utf-8") as f:
+        f.write(markdown_text)
+
+    return file_path
+
+
+
+####################################################
+#Edit and re-render audio from existing LLM output
+####################################################
+
+import pandas as pd
+from typing import List
+
+def dialogue_to_df(dlg: Dialogue) -> pd.DataFrame:
+    data = [{"Speaker": item.speaker, "Line": item.text} for item in dlg.dialogue]
+    return pd.DataFrame(data)
+
+def df_to_dialogue(df: pd.DataFrame, scratchpad: str = "") -> Dialogue:
+    items: List[DialogueItem] = [
+        DialogueItem(speaker=row["Speaker"], text=row["Line"])
+        for _, row in df.iterrows()
+    ]
+    return Dialogue(scratchpad=scratchpad, dialogue=items)
+
+def save_dialogue_edits(df, cached_dialogue):
+    """
+    Save the edited dialogue and update the per-session cached state.
+    """
+    if cached_dialogue is None:
+        raise gr.Error("Nothing to edit yet – run Generate Audio first.")
+
+    import pandas as pd
+    new_dlg = df_to_dialogue(pd.DataFrame(df, columns=["Speaker", "Line"]))
+
+    # regenerate plain transcript so the user sees the change immediately
+    transcript_str = "\n".join(f"{d.speaker}: {d.text}" for d in new_dlg.dialogue)
+
+    # Return updated state and transcript
+    return new_dlg, gr.update(value=transcript_str), "Edits saved. Press *Re‑render* to hear them."
+
+
+def render_audio_from_dialogue(
+    cached_dialogue, # 👈 NEW: pass in as argument
+    openai_api_key: str,
+    audio_model: str,
+    speaker_1_voice: str,
+    speaker_2_voice: str,
+    speaker_1_instructions: str,
+    speaker_2_instructions: str,
+) -> tuple[str, str]: # mp3 file path, transcript
+
+    if cached_dialogue is None:
+        raise gr.Error("Nothing to re‑render yet – run Generate Audio first.")
+
+    dlg = cached_dialogue
+    audio_bytes, transcript, characters = b"", "", 0
+
+    with cf.ThreadPoolExecutor() as ex:
+        futures = []
+        for item in dlg.dialogue:
+            voice = speaker_1_voice if item.speaker == "speaker-1" else speaker_2_voice
+            instr = speaker_1_instructions if item.speaker == "speaker-1" else speaker_2_instructions
+            futures.append(
+                (
+                    ex.submit(get_mp3, item.text, voice, audio_model, openai_api_key, instr),
+                    f"{item.speaker}: {item.text}",
+                )
+            )
+            characters += len(item.text)
+
+    for fut, line in futures:
+        audio_bytes += fut.result()
+        transcript += line + "\n\n"
+
+    logger.info(f"[Re‑render] {characters} characters voiced")
+
+    # Write to temporary .mp3 file
+    temporary_directory = "./gradio_cached_examples/tmp/"
+    os.makedirs(temporary_directory, exist_ok=True)
+
+    temporary_file = NamedTemporaryFile(
+        dir=temporary_directory,
+        delete=False,
+        prefix="PDF2Audio_",
+        suffix=".mp3",
+    )
+    temporary_file.write(audio_bytes)
+    temporary_file.close()
+
+    # Clean up old files
+    for file in glob.glob(f"{temporary_directory}*.mp3"):
+        if os.path.isfile(file) and time.time() - os.path.getmtime(file) > 24 * 60 * 60:
+            os.remove(file)
+
+    return temporary_file.name, transcript
+
+
 with gr.Blocks(title="PDF to Audio", css="""
 #header {
     display: flex;
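A round-trip sketch for the helpers added above (toy data, illustrative): a Dialogue goes into the spreadsheet editor as a DataFrame, edited rows come back as a validated Dialogue, and the result can be exported as Markdown.

    toy = Dialogue(scratchpad="", dialogue=[
        DialogueItem(speaker="speaker-1", text="Welcome to the show."),
        DialogueItem(speaker="speaker-2", text="Thanks, glad to be here."),
    ])
    df = dialogue_to_df(toy)                      # two rows with Speaker / Line columns
    df.loc[1, "Line"] = "Thanks for having me."   # simulate an edit made in the Dataframe component
    edited = df_to_dialogue(df)                   # back to a validated Dialogue
    print(dialogue_to_markdown(edited))           # Markdown used for the download button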
@@ -815,6 +1021,8 @@ with gr.Blocks(title="PDF to Audio", css="""
     margin-top: 20px;
 }
 """) as demo:
+
+    cached_dialogue = gr.State()
 
     with gr.Row(elem_id="header"):
         with gr.Column(scale=4):
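A minimal sketch of the gr.State pattern introduced here (assuming gradio is imported as gr, as in the file; names illustrative): the state value is passed into and returned from event handlers, so each browser session keeps its own cached dialogue without globals.

    with gr.Blocks() as sketch_demo:
        counter = gr.State()                      # starts as None for every new session
        out = gr.Textbox()
        btn = gr.Button("Remember")

        def remember(value):
            value = (value or 0) + 1              # update the per-session value
            return value, f"clicked {value} times"

        btn.click(fn=remember, inputs=[counter], outputs=[counter, out])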
@@ -889,6 +1097,12 @@ with gr.Blocks(title="PDF to Audio", css="""
                 info="If you are using a custom or local model, provide the API base URL here, e.g.: http://localhost:8080/v1 for llama.cpp REST server.",
             )
 
+            do_web_search = gr.Checkbox(
+                label="Let the LLM search the web to complement the documents.",
+                value=False,
+                info="When enabled, the LLM will call the web search tool during its reasoning."
+            )
+
         with gr.Column(scale=3):
             template_dropdown = gr.Dropdown(
                 label="Instruction Template",
@@ -929,7 +1143,7 @@ with gr.Blocks(title="PDF to Audio", css="""
             )
 
             audio_output = gr.Audio(label="Audio", format="mp3", interactive=False, autoplay=False)
-            transcript_output = gr.Textbox(label="Transcript", lines=
+            transcript_output = gr.Textbox(label="Transcript", lines=25, show_copy_button=True)
             original_text_output = gr.Textbox(label="Original Text", lines=10, visible=False)
             error_output = gr.Textbox(visible=False) # Hidden textbox to store error message
 
@@ -940,6 +1154,45 @@ with gr.Blocks(title="PDF to Audio", css="""
             user_feedback = gr.Textbox(label="Provide Feedback or Notes", lines=10, #placeholder="Enter your feedback or notes here..."
             )
             regenerate_btn = gr.Button("Regenerate Audio with Edits and Feedback")
+
+            with gr.Accordion("Edit dialogue line‑by‑line", open=False) as editor_box:
+                df_editor = gr.Dataframe(
+                    headers=["Speaker", "Line"],
+                    datatype=["str", "str"],
+                    wrap=True,
+                    interactive=True,
+                    row_count=(1, "dynamic"),
+                    col_count=(2, "fixed"),
+                )
+
+                save_btn = gr.Button("Save edits")
+                save_msg = gr.Markdown()
+
+
+                save_btn.click(
+                    fn=save_dialogue_edits,
+                    inputs=[df_editor, cached_dialogue],
+                    outputs=[cached_dialogue, transcript_output, save_msg],
+                )
+
+                rerender_btn = gr.Button("Re‑render with current voice settings (must have generated original LLM output)")
+
+                rerender_btn.click(
+                    fn=render_audio_from_dialogue,
+                    inputs=[
+                        cached_dialogue,
+                        openai_api_key,
+                        audio_model,
+                        speaker_1_voice,
+                        speaker_2_voice,
+                        speaker_1_instructions,
+                        speaker_2_instructions,
+                    ],
+                    outputs=[audio_output, transcript_output],
+                )
+
+
+
     # Function to update the interactive state of edited_transcript
     def update_edit_box(checkbox_value):
         return gr.update(interactive=checkbox_value, lines=20 if checkbox_value else 20, visible=True if checkbox_value else False)
@@ -960,15 +1213,16 @@ with gr.Blocks(title="PDF to Audio", css="""
     submit_btn.click(
         fn=validate_and_generate_audio,
         inputs=[
-            files, openai_api_key, text_model, reasoning_effort, audio_model,
+            files, openai_api_key, text_model, reasoning_effort, do_web_search, audio_model,
             speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
             api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
-            edited_transcript,
-            user_feedback,
+            edited_transcript,
+            user_feedback,
+
         ],
-        outputs=[audio_output, transcript_output, original_text_output, error_output]
+        outputs=[audio_output, transcript_output, original_text_output, error_output, cached_dialogue, ]
     ).then(
         fn=lambda audio, transcript, original_text, error: (
             transcript if transcript else "",
@@ -980,7 +1234,11 @@ with gr.Blocks(title="PDF to Audio", css="""
         fn=lambda error: gr.Warning(error) if error else None,
         inputs=[error_output],
         outputs=[]
-    )
+    ).then( # fill spreadsheet editor
+        fn=dialogue_to_df,
+        inputs=[cached_dialogue],
+        outputs=[df_editor],
+    )
 
     regenerate_btn.click(
         fn=lambda use_edit, edit, *args: validate_and_generate_audio(
@@ -990,14 +1248,14 @@ with gr.Blocks(title="PDF to Audio", css="""
         ),
         inputs=[
             use_edited_transcript, edited_transcript,
-            files, openai_api_key, text_model, reasoning_effort, audio_model,
+            files, openai_api_key, text_model, reasoning_effort, do_web_search, audio_model,
             speaker_1_voice, speaker_2_voice, speaker_1_instructions, speaker_2_instructions,
             api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
             user_feedback, original_text_output
         ],
-        outputs=[audio_output, transcript_output, original_text_output, error_output]
+        outputs=[audio_output, transcript_output, original_text_output, error_output, cached_dialogue, ]
     ).then(
         fn=lambda audio, transcript, original_text, error: (
             transcript if transcript else "",
@@ -1009,6 +1267,20 @@ with gr.Blocks(title="PDF to Audio", css="""
         fn=lambda error: gr.Warning(error) if error else None,
         inputs=[error_output],
         outputs=[]
+    ).then( # fill spreadsheet editor
+        fn=dialogue_to_df,
+        inputs=[cached_dialogue],
+        outputs=[df_editor],
+    )
+
+    with gr.Row():
+        save_md_btn = gr.Button("Download Markdown of Dialogue")
+        markdown_file_output = gr.File(label="Download .md file")
+
+    save_md_btn.click(
+        fn=save_dialogue_as_markdown,
+        inputs=[cached_dialogue],
+        outputs=[markdown_file_output],
     )
 
     # Add README content at the bottom