PierreBrunelle committed
Commit ddd522e · verified · 1 Parent(s): 0b74be0

Update app.py

Files changed (1)
  1. app.py +248 -2
app.py CHANGED
@@ -1,6 +1,252 @@
 import gradio as gr
-from src.interface import create_interface
+import pixeltable as pxt
+from pixeltable.iterators import DocumentSplitter
+from pixeltable.functions import openai
+import os
+import requests
+import tempfile
+
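+# Pipeline: load the PDF into a Pixeltable table, split it into chunks, derive
+# content and an audio script per chunk with the chat model, then synthesize speech.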
+def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size, temperature, max_tokens, progress=gr.Progress()):
+    try:
+        os.environ['OPENAI_API_KEY'] = api_key
+
+        progress(0.1, desc="Initializing...")
+        pxt.drop_dir('document_audio', force=True)
+        pxt.create_dir('document_audio')
+
+        docs = pxt.create_table(
+            'document_audio.documents',
+            {
+                'document': pxt.DocumentType(),
+                'voice': pxt.StringType(),
+                'style': pxt.StringType()
+            }
+        )
+
+        progress(0.2, desc="Processing document...")
+        docs.insert([{
+            'document': pdf_file.name,
+            'voice': voice_choice,
+            'style': style_choice
+        }])
+
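+        # Split the document into token-limited chunks via a DocumentSplitter view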
+        chunks = pxt.create_view(
+            'document_audio.chunks',
+            docs,
+            iterator=DocumentSplitter.create(
+                document=docs.document,
+                separators='token_limit',
+                limit=chunk_size
+            )
+        )
+
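+        # Computed column: rewrite each chunk into clear, concise content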
+        progress(0.4, desc="Text processing...")
+        chunks['content_response'] = openai.chat_completions(
+            messages=[
+                {
+                    'role': 'system',
+                    'content': """Transform this text segment into clear, concise content.
+                    Structure:
+                    1. Core concepts and points
+                    2. Supporting details
+                    3. Key takeaways"""
+                },
+                {'role': 'user', 'content': chunks.text}
+            ],
+            model='gpt-4o-mini-2024-07-18',
+            max_tokens=max_tokens,
+            temperature=temperature
+        )
+        chunks['content'] = chunks.content_response['choices'][0]['message']['content']
+
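+        # Computed column: turn the cleaned content into a spoken-word script in the chosen style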
+        progress(0.6, desc="Script generation...")
+        chunks['script_response'] = openai.chat_completions(
+            messages=[
+                {
+                    'role': 'system',
+                    'content': f"""Convert content to audio script.
+                    Style: {docs.style}
+                    Format:
+                    - Clear sentence structures
+                    - Natural pauses (...)
+                    - Term definitions when needed
+                    - Proper transitions
+                    - Appropriate pronunciation guidance"""
+                },
+                {'role': 'user', 'content': chunks.content}
+            ],
+            model='gpt-4o-mini-2024-07-18',
+            max_tokens=max_tokens,
+            temperature=temperature
+        )
+        chunks['script'] = chunks.script_response['choices'][0]['message']['content']
+
+        progress(0.8, desc="Audio synthesis...")
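+        # UDF that calls OpenAI's text-to-speech endpoint and writes the MP3 to a temp file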
+        @pxt.udf(return_type=pxt.AudioType())
+        def generate_audio(script: str, voice: str):
+            if not script or not voice:
+                return None
+            try:
+                response = requests.post(
+                    "https://api.openai.com/v1/audio/speech",
+                    headers={"Authorization": f"Bearer {api_key}"},
+                    json={"model": "tts-1", "input": script, "voice": voice}
+                )
+                if response.status_code == 200:
+                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
+                    temp_file.write(response.content)
+                    temp_file.close()
+                    return temp_file.name
+            except Exception as e:
+                print(f"Error in audio synthesis: {e}")
+            return None
+
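+        # Computed column: synthesize audio for each chunk's script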
+        chunks['audio'] = generate_audio(chunks.script, docs.voice)
+
+        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]
+
+        results = chunks.select(chunks.content, chunks.script).collect()
+
+        display_data = [
+            [f"Segment {idx + 1}", row['content'], row['script']]
+            for idx, row in enumerate(results)
+        ]
+
+        progress(1.0, desc="Complete")
+        return display_data, audio_path, "Processing complete"
+
+    except Exception as e:
+        return None, None, f"Error: {str(e)}"
+
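+# Gradio UI: API key and document inputs, processing configuration, and content/audio outputs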
+with gr.Blocks(theme=gr.themes.Base()) as demo:
+    gr.HTML(
+        """
+        <div style="margin-bottom: 1rem;">
+            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
+                 alt="Pixeltable" style="max-width: 150px;" />
+            <h1>📄 Document to Audio Synthesis 🎧</h1>
+        </div>
+        """
+    )
+
+    with gr.Row():
+        with gr.Column():
+            with gr.Accordion("🎯 What does it do?", open=False):
+                gr.Markdown("""
+                1. 📄 **Document Processing:** PDF extraction and token-based chunking
+                2. 🤖 **Content Pipeline:** LLM-powered text optimization and script generation
+                3. 🔊 **Audio Generation:** Neural TTS synthesis with voice modulation
+                """)
+        with gr.Column():
+            with gr.Accordion("⚡ How does it work?", open=False):
+                gr.Markdown("""
+                1. 📑 **Segmentation:** Token-based document chunking with configurable limits
+                2. 🔍 **Transformation:** Dual-pass LLM processing with temperature control
+                3. 🎵 **Synthesis:** OpenAI TTS with multi-voice capability
+                """)
+
+    gr.HTML(
+        """
+        <div style="background-color: #FFF3CD; border: 1px solid #FFEEBA; padding: 1rem; margin: 1rem 0; border-radius: 4px;">
+            <p style="margin: 0; color: #856404;">
+                ⚠️ <strong>API Cost Notice:</strong> This application uses OpenAI's Text-to-Speech API which incurs costs per use.
+                See <a href="https://platform.openai.com/docs/guides/text-to-speech" target="_blank" style="color: #856404; text-decoration: underline;">OpenAI's TTS Documentation</a>
+                for current pricing information.
+            </p>
+        </div>
+        """
+    )
+
+    with gr.Row():
+        with gr.Column():
+            with gr.Accordion("🔑 Input & Voice", open=True):
+                api_key = gr.Textbox(
+                    label="OpenAI API Key",
+                    placeholder="sk-...",
+                    type="password"
+                )
+                file_input = gr.File(
+                    label="PDF Document",
+                    file_types=[".pdf"]
+                )
+
+        with gr.Column():
+            with gr.Accordion("⚙️ Processing Configuration", open=True):
+                style_select = gr.Radio(
+                    choices=["Technical", "Narrative", "Instructional", "Descriptive"],
+                    value="Technical",
+                    label="💫 Style"
+                )
+                with gr.Row():
+                    voice_select = gr.Radio(
+                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
+                        value="onyx",
+                        label="🎙️ Voice Model"
+                    )
+                with gr.Row():
+                    chunk_size = gr.Slider(
+                        minimum=100, maximum=1000, value=300, step=50,
+                        label="📏 Chunk Size"
+                    )
+                    temperature = gr.Slider(
+                        minimum=0, maximum=1, value=0.7, step=0.1,
+                        label="🌡️ Temperature"
+                    )
+                    max_tokens = gr.Slider(
+                        minimum=100, maximum=1000, value=300, step=50,
+                        label="📊 Tokens"
+                    )
+    with gr.Row():
+        process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
+
+    with gr.Tabs():
+        with gr.TabItem("📝 Content"):
+            output_table = gr.Dataframe(
+                headers=["🔍 Segment", "📄 Content", "🎭 Script"],
+                wrap=True
+            )
+        with gr.TabItem("🎧 Audio"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    audio_output = gr.Audio(
+                        label="🔊 Generated Audio",
+                        type="filepath",
+                        show_download_button=True
+                    )
+                with gr.Column(scale=1):
+                    with gr.Accordion("📚 Technical Notes", open=True):
+                        gr.Markdown("""
+                        - 🎯 Temperature < 0.5: Deterministic output
+                        - 📏 Chunk size affects token context
+                        - 🎙️ Voice models vary in prosody
+                        - 💰 API usage is billed per character
+                        """)
+
+    gr.HTML(
+        """
+        <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
+            <p style="margin: 0; color: #666; font-size: 0.8em;">
+                🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
+                | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
+                | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
+            </p>
+        </div>
+        """
+    )
+
+    def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
+        # process_document also returns a status string; only the table and audio are wired to outputs
+        table_data, audio_path, _status = process_document(
+            pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
+        )
+        return table_data, audio_path
+
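+    # Wire the button to the processing pipeline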
+    process_btn.click(
+        update_interface,
+        inputs=[
+            file_input, api_key, voice_select, style_select,
+            chunk_size, temperature, max_tokens
+        ],
+        outputs=[output_table, audio_output]
+    )
 
 if __name__ == "__main__":
-    demo = create_interface()
     demo.launch(debug=True)