Delete src
Browse files- src/interface.py +0 -215
- src/processor.py +0 -120
src/interface.py
DELETED
@@ -1,215 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
from .processor import process_document
|
3 |
-
|
4 |
-
# Synthesis mode presets. Each mode bundles:
#   - "description":   short text shown in the UI when the mode is selected
#   - "styles":        the style choices offered for that mode (first entry
#                      is the default selection)
#   - "default_temp":  default LLM sampling temperature for the slider
#   - "default_chunks": default token-limit chunk size for the slider
#   - "system_prompt": system message sent to the LLM for this mode
# Key order matters: it is the display order of the mode Radio choices.
SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": """Convert this content into clear narration."""
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": """Transform this content into engaging podcast-style speech."""
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": """Convert this content into a presentation format."""
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": """Transform this content into an engaging story."""
    }
}
|
34 |
-
|
35 |
-
def create_interface():
    """Build and return the Gradio Blocks UI for document-to-audio synthesis.

    Lays out header, overview accordions, core settings (API key + PDF
    upload), mode/voice/style selectors, processing sliders, and the
    output tabs, then wires two callbacks:

    - ``mode_select.change`` -> ``update_mode``: refresh style choices,
      chunk-size/temperature defaults, and the mode description.
    - ``process_btn.click`` -> ``update_interface``: run
      ``process_document`` and fill the content table, audio player,
      and status box.

    Returns:
        gr.Blocks: the assembled (not yet launched) Gradio app.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header: logo + title.
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>📄 Document to Audio Synthesis 🎧</h1>
            </div>
            """
        )

        # Overview Row
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                    - 📄 Document processing - 🧠 Content transformation
                    - 🎧 Audio synthesis - ⚙️ Multiple output styles
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                    1. 📑 **Processing:** Token-based segmentation
                    2. 🔍 **Analysis:** LLM optimization & scripts
                    3. 🎵 **Synthesis:** Multiple voice options
                    """)

        # NOTE(review): this State is created but never read or written by
        # any callback below — looks like dead code; confirm before removing.
        synthesis_mode = gr.State(SYNTHESIS_MODES["narration"])

        # Main Settings Row
        with gr.Row():
            # Core Settings Column
            with gr.Column():
                with gr.Accordion("🔑 Core Settings", open=True):
                    with gr.Row():
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            scale=2
                        )
                        file_input = gr.File(
                            label="PDF Document",
                            file_types=[".pdf"],
                            scale=1
                        )

            # Mode Selection Column
            with gr.Column():
                with gr.Accordion("🎭 Output Mode", open=True):
                    mode_select = gr.Radio(
                        choices=list(SYNTHESIS_MODES.keys()),
                        value="narration",
                        label="Select Mode",
                        info="Choose output style"
                    )
                    # Updated by update_mode() whenever the mode changes.
                    mode_description = gr.Markdown(
                        SYNTHESIS_MODES["narration"]["description"]
                    )

        # Voice and Processing Settings Row
        with gr.Row():
            # Voice Settings Column
            with gr.Column():
                with gr.Accordion("🎛️ Voice & Style", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True
                    )
                    # Choices/default are replaced per-mode by update_mode().
                    style_select = gr.Radio(
                        choices=SYNTHESIS_MODES["narration"]["styles"],
                        value=SYNTHESIS_MODES["narration"]["styles"][0],
                        label="💫 Style",
                        interactive=True
                    )

            # Processing Settings Column
            with gr.Column():
                with gr.Accordion("⚙️ Processing Parameters", open=True):
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100, maximum=1000,
                            value=SYNTHESIS_MODES["narration"]["default_chunks"],
                            step=50,
                            label="📏 Chunk Size"
                        )
                        temperature = gr.Slider(
                            minimum=0, maximum=1,
                            value=SYNTHESIS_MODES["narration"]["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature"
                        )
                        max_tokens = gr.Slider(
                            minimum=100, maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Tokens"
                        )

        # Process Button Row
        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)

        # Output Section
        with gr.Tabs():
            with gr.TabItem("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True
                )
            with gr.TabItem("🎧 Audio"):
                with gr.Row():
                    with gr.Column(scale=2):
                        audio_output = gr.Audio(
                            label="🔊 Synthesized Audio",
                            type="filepath",
                            show_download_button=True
                        )
                    with gr.Column(scale=1):
                        with gr.Accordion("📚 Quick Tips", open=True):
                            gr.Markdown("""
                            - 🎯 Lower temperature = more consistent
                            - 📏 Smaller chunks = more precise
                            - 🎙️ Try different voices for best fit
                            - 💫 Match style to content type
                            """)

        # Footer with project links.
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
                    | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
                </p>
            </div>
            """
        )

        def update_mode(mode_name):
            """Sync style choices, slider defaults, and description to the mode."""
            mode = SYNTHESIS_MODES[mode_name]
            return (
                gr.update(choices=mode["styles"], value=mode["styles"][0]),
                gr.update(value=mode["default_chunks"]),
                gr.update(value=mode["default_temp"]),
                mode["description"]
            )

        mode_select.change(
            update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description]
        )

        def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
            """Resolve the mode's system prompt and run the processing pipeline."""
            mode = SYNTHESIS_MODES[mode_name]
            return process_document(
                pdf_file=pdf_file,
                api_key=api_key,
                voice_choice=voice,
                style_choice=style,
                chunk_size=chunk_size,
                temperature=temperature,
                max_tokens=max_tokens,
                system_prompt=mode["system_prompt"]
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, mode_select, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/processor.py
DELETED
@@ -1,120 +0,0 @@
|
|
1 |
-
import pixeltable as pxt
|
2 |
-
from pixeltable.iterators import DocumentSplitter
|
3 |
-
from pixeltable.functions import openai
|
4 |
-
import os
|
5 |
-
import requests
|
6 |
-
import tempfile
|
7 |
-
import gradio as gr
|
8 |
-
|
9 |
-
def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size, temperature, max_tokens, system_prompt, progress=gr.Progress()):
    """Run the full document -> LLM -> TTS pipeline and return UI outputs.

    Pipeline: recreate the ``document_audio`` Pixeltable directory, insert
    the uploaded PDF, split it into token-limited chunks, run two LLM
    passes (content transformation, then audio-script generation), then
    synthesize speech for each chunk via the OpenAI TTS endpoint.

    Args:
        pdf_file: uploaded file object (Gradio ``gr.File`` value); its
            ``.name`` is the path on disk.
        api_key: OpenAI API key; exported to the environment for Pixeltable's
            OpenAI functions and used directly for the TTS request.
        voice_choice: TTS voice name (e.g. "onyx").
        style_choice: style label interpolated into the script prompt.
        chunk_size: token limit per document chunk.
        temperature: LLM sampling temperature for both passes.
        max_tokens: per-chunk completion cap for both passes.
        system_prompt: mode-specific system message for the first LLM pass.
        progress: Gradio progress reporter.

    Returns:
        tuple: ``(display_data, audio_path, status_message)`` where
        ``display_data`` is rows of [segment label, content, script];
        on failure, ``(None, None, "Error: ...")``.
    """
    try:
        # Guard clauses: fail fast with a readable message instead of an
        # opaque AttributeError from pdf_file.name deep in the pipeline.
        if pdf_file is None:
            return None, None, "Error: Please upload a PDF document"
        if not api_key:
            return None, None, "Error: Please provide an OpenAI API key"

        os.environ['OPENAI_API_KEY'] = api_key

        progress(0.1, desc="Initializing...")
        # Rebuild the workspace from scratch on every run.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.Document,
                'voice': pxt.String,
                'style': pxt.String,
                'mode_prompt': pxt.String
            }
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{
            'document': pdf_file.name,
            'voice': voice_choice,
            'style': style_choice,
            'mode_prompt': system_prompt
        }])

        # One row per token-limited chunk of the document.
        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=chunk_size
            )
        )

        progress(0.4, desc="Text processing...")
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': docs.mode_prompt  # Use the mode-specific prompt
                },
                {'role': 'user', 'content': chunks.text}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )

        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        # Second pass: turn the transformed content into a speakable script.
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': f"""Convert content to audio script.
                    Style: {docs.style}
                    Format:
                    - Clear sentence structures
                    - Natural pauses (...)
                    - Term definitions when needed
                    - Proper transitions"""
                },
                {'role': 'user', 'content': chunks.content}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.Audio)
        def generate_audio(script: str, voice: str):
            """Synthesize one chunk's script to an mp3 file; None on failure."""
            if not script or not voice:
                return None
            try:
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": "tts-1", "input": script, "voice": voice}
                )
                if response.status_code == 200:
                    # delete=False: the path outlives this UDF and is served
                    # by the Audio component. Not cleaned up here.
                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
                    temp_file.write(response.content)
                    temp_file.close()
                    return temp_file.name
                # Previously a non-200 response fell through silently;
                # surface it so failures are diagnosable.
                print(f"Error in audio synthesis: HTTP {response.status_code}: {response.text}")
            except Exception as e:
                print(f"Error in audio synthesis: {e}")
            return None

        chunks['audio'] = generate_audio(chunks.script, docs.voice)

        # Only the last chunk's audio is surfaced in the player.
        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]

        results = chunks.select(
            chunks.content,
            chunks.script
        ).collect()

        display_data = [
            [f"Segment {idx + 1}", row['content'], row['script']]
            for idx, row in enumerate(results)
        ]

        progress(1.0, desc="Complete")
        return display_data, audio_path, "Processing complete"

    except Exception as e:
        # Broad catch is deliberate: any pipeline failure becomes a status
        # message for the UI instead of a crash.
        return None, None, f"Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|