# app.py -- file-viewer header residue, kept as a comment: last commit 1c0e277 ("Update app.py"), author avatar "PierreBrunelle", file size 10.2 kB.
import gradio as gr
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions import openai
import os
import requests
import tempfile
def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size, temperature, max_tokens, progress=gr.Progress()):
    """Run the document -> summary -> script -> speech pipeline for one PDF.

    The function rebuilds the ``document_audio`` Pixeltable directory from
    scratch, inserts the uploaded document, splits it into token-limited
    chunks, and adds computed columns that (1) condense each chunk with an
    OpenAI chat model, (2) rewrite the condensed text as an audio script, and
    (3) synthesize speech for each script through the OpenAI speech endpoint.

    Args:
        pdf_file: Uploaded file; its ``.name`` is used as the document path
            (a plain path string is also accepted).
        api_key: OpenAI API key. It is exported as ``OPENAI_API_KEY`` (a
            process-wide environment variable) and also sent as the bearer
            token of the speech request.
        voice_choice: Voice name forwarded to the speech endpoint.
        style_choice: Style label inserted into the script-generation prompt.
        chunk_size: Token limit per chunk for the document splitter.
        temperature: Sampling temperature for both chat completions.
        max_tokens: Completion token cap for both chat completions.
        progress: Gradio progress callback.

    Returns:
        A 3-tuple ``(display_data, audio_path, status)``:
        ``display_data`` is a list of ``[segment label, content, script]``
        rows, ``audio_path`` is the audio value of the last chunk (``None``
        when synthesis produced nothing), and ``status`` is a human-readable
        message. On any failure the tuple is ``(None, None, "Error: ...")``;
        exceptions are not propagated to the caller.
    """
    # Fail fast with an actionable message instead of letting a missing
    # upload/key surface later as an opaque attribute or HTTP error.
    if pdf_file is None:
        return None, None, "Error: please upload a PDF document."
    if not api_key or not api_key.strip():
        return None, None, "Error: please provide an OpenAI API key."
    api_key = api_key.strip()

    try:
        # Sliders may deliver floats; these two parameters are integer-valued.
        chunk_size = int(chunk_size)
        max_tokens = int(max_tokens)
        document_path = getattr(pdf_file, 'name', pdf_file)

        # NOTE: os.environ is process-wide, so this key is visible to every
        # request served by this process until it is overwritten.
        os.environ['OPENAI_API_KEY'] = api_key

        progress(0.1, desc="Initializing...")
        # Start from a clean directory on every run.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.DocumentType(),
                'voice': pxt.StringType(),
                'style': pxt.StringType()
            }
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{
            'document': document_path,
            'voice': voice_choice,
            'style': style_choice
        }])

        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=chunk_size
            )
        )

        progress(0.4, desc="Text processing...")
        content_prompt = (
            "Transform this text segment into clear, concise content.\n"
            "Structure:\n"
            "1. Core concepts and points\n"
            "2. Supporting details\n"
            "3. Key takeaways"
        )
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {'role': 'system', 'content': content_prompt},
                {'role': 'user', 'content': chunks.text}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        # The style must be interpolated from the plain Python value: an
        # f-string is evaluated immediately, so formatting the column
        # reference `docs.style` would embed the expression object's text
        # rather than the selected style.
        script_prompt = (
            "Convert content to audio script.\n"
            f"Style: {style_choice}\n"
            "Format:\n"
            "- Clear sentence structures\n"
            "- Natural pauses (...)\n"
            "- Term definitions when needed\n"
            "- Proper transitions\n"
            "- Appropriate pronunciation guidance"
        )
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {'role': 'system', 'content': script_prompt},
                {'role': 'user', 'content': chunks.content}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.AudioType())
        def generate_audio(script: str, voice: str):
            # Best-effort by design: any failure yields None for this row so
            # one bad chunk does not abort the whole column computation.
            if not script or not voice:
                return None
            try:
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": "tts-1", "input": script, "voice": voice},
                    # Without a timeout a stalled connection blocks forever.
                    timeout=120
                )
                if response.status_code != 200:
                    # Surface the reason instead of silently returning None.
                    print(f"Error in audio synthesis: HTTP {response.status_code}: {response.text[:200]}")
                    return None
                # delete=False: the file must outlive this call so the path
                # can be returned; the context manager guarantees it is
                # flushed and closed even if the write fails.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as audio_file:
                    audio_file.write(response.content)
                return audio_file.name
            except Exception as e:
                print(f"Error in audio synthesis: {e}")
            return None

        chunks['audio'] = generate_audio(chunks.script, docs.voice)

        # Only the last chunk's audio is surfaced; guard against a document
        # that produced no chunks at all.
        audio_values = chunks.select(chunks.audio).tail(1)['audio']
        audio_path = audio_values[0] if audio_values else None

        results = chunks.select(chunks.content, chunks.script).collect()
        display_data = [
            [f"Segment {idx + 1}", row['content'], row['script']]
            for idx, row in enumerate(results)
        ]

        progress(1.0, desc="Complete")
        if audio_path is None:
            return display_data, None, "Processing complete (no audio was generated)"
        return display_data, audio_path, "Processing complete"
    except Exception as e:
        # UI boundary: report the failure as a status string rather than
        # letting the exception escape into the event handler.
        return None, None, f"Error: {str(e)}"
# ---------------------------------------------------------------------------
# Gradio UI: header, explanatory accordions, cost notice, inputs, outputs,
# footer, and the click handler that runs the pipeline.
# NOTE(review): the layout nesting below is reconstructed from statement
# order -- confirm against the deployed layout.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown(
        """
        <div>
        <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 20px;" />
        <h1 style="margin-bottom: 0.5em;">📄 Document to Audio Synthesis 🎧</h1>
        </div>
        """
    )

    # Two collapsed accordions describing the app.
    with gr.Row():
        with gr.Column():
            with gr.Accordion("🎯 What does it do?", open=False):
                gr.Markdown("""
                1. 📄 **Document Processing:** PDF extraction and token-based chunking
                2. 🤖 **Content Pipeline:** LLM-powered text optimization and script generation
                3. 🔊 **Audio Generation:** Neural TTS synthesis with voice modulation
                """)
        with gr.Column():
            with gr.Accordion("⚡ How does it work?", open=False):
                gr.Markdown("""
                1. 📑 **Segmentation:** Token-based document chunking with configurable limits
                2. 🔍 **Transformation:** Dual-pass LLM processing with temperature control
                3. 🎵 **Synthesis:** OpenAI TTS with multi-voice capability
                """)

    gr.HTML(
        """
        <div style="background-color: #FFF3CD; border: 1px solid #FFEEBA; padding: 1rem; margin: 1rem 0; border-radius: 4px;">
        <p style="margin: 0; color: #856404;">
        ⚠️ <strong>API Cost Notice:</strong> This application uses OpenAI's Text-to-Speech API which incurs costs per use.
        See <a href="https://platform.openai.com/docs/guides/text-to-speech" target="_blank" style="color: #856404; text-decoration: underline;">OpenAI's TTS Documentation</a>
        for current pricing information.
        </p>
        </div>
        """
    )

    # Inputs: credentials + document on the left, tuning knobs on the right.
    with gr.Row():
        with gr.Column():
            with gr.Accordion("🔑 Input & Voice", open=True):
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password"
                )
                file_input = gr.File(
                    label="PDF Document",
                    file_types=[".pdf"]
                )
        with gr.Column():
            with gr.Accordion("⚙️ Processing Configuration", open=True):
                style_select = gr.Radio(
                    choices=["Technical", "Narrative", "Instructional", "Descriptive"],
                    value="Technical",
                    label="💫 Style"
                )
                with gr.Row():
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice Model"
                    )
                with gr.Row():
                    chunk_size = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="📏 Chunk Size"
                    )
                    temperature = gr.Slider(
                        minimum=0, maximum=1, value=0.7, step=0.1,
                        label="🌡️ Temperature"
                    )
                    max_tokens = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="📊 Tokens"
                    )

    with gr.Row():
        process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)

    # Receives the third value returned by the handler (completion or error
    # text). Without a component for it the message never reaches the user.
    status_output = gr.Textbox(label="Status", interactive=False)

    # Outputs: per-segment text in one tab, synthesized audio in the other.
    with gr.Tabs():
        with gr.TabItem("📝 Content"):
            output_table = gr.Dataframe(
                headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                wrap=True
            )
        with gr.TabItem("🎧 Audio"):
            with gr.Row():
                with gr.Column(scale=2):
                    audio_output = gr.Audio(
                        label="🔊 Generated Audio",
                        type="filepath",
                        show_download_button=True
                    )
                with gr.Column(scale=1):
                    with gr.Accordion("📚 Technical Notes", open=True):
                        gr.Markdown("""
                        - 🎯 Temperature < 0.5: Deterministic output
                        - 📏 Chunk size affects token context
                        - 🎙️ Voice models vary in prosody
                        - 💰 API usage is billed per character
                        """)

    gr.HTML(
        """
        <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
        <p style="margin: 0; color: #666; font-size: 0.8em;">
        🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
        | 📚 <a href="https://docs.pixeltable.com" target="_blank" style="color: #666;">Docs</a>
        | 🤗 <a href="https://huggingface.co/spaces/Pixeltable" target="_blank" style="color: #666;">HF Space</a>
        </p>
        </div>
        """
    )

    def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
        """Click handler: forward the UI values to ``process_document``.

        Returns whatever ``process_document`` returns, i.e. a 3-tuple of
        (table rows, audio path, status message), matching the three
        components listed in ``outputs`` below.
        """
        return process_document(
            pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
        )

    process_btn.click(
        update_interface,
        inputs=[
            file_input, api_key, voice_select, style_select,
            chunk_size, temperature, max_tokens
        ],
        # Three outputs for the three returned values; previously only two
        # were listed, so the status/error text had nowhere to go.
        outputs=[output_table, audio_output, status_output]
    )

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()