"""Gradio app that converts a PDF document into a narrated audio script.

Pipeline (all state lives in the Pixeltable directory ``document_audio``):
the uploaded document is split into token-limited chunks, each chunk is
rewritten by an OpenAI chat model, turned into an audio script by a second
chat call, and finally synthesized to speech through the OpenAI TTS endpoint.
"""

import os
import tempfile

import gradio as gr
import pixeltable as pxt
import requests
from pixeltable.functions import openai
from pixeltable.iterators import DocumentSplitter

CHAT_MODEL = 'gpt-4o-mini-2024-07-18'
TTS_ENDPOINT = "https://api.openai.com/v1/audio/speech"
TTS_MODEL = "tts-1"
# Upper bound for a single TTS request so a stalled connection cannot hang
# the whole run indefinitely.
TTS_TIMEOUT_SECONDS = 120

CONTENT_SYSTEM_PROMPT = (
    "Transform this text segment into clear, concise content. "
    "Structure: 1. Core concepts and points 2. Supporting details "
    "3. Key takeaways"
)


def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size,
                     temperature, max_tokens, progress=gr.Progress()):
    """Run the document -> content -> script -> audio pipeline.

    Args:
        pdf_file: Uploaded document; either a path string or an object whose
            ``name`` attribute is the path.
        api_key: OpenAI API key; exported as ``OPENAI_API_KEY`` and used for
            the TTS request.
        voice_choice: Voice name sent to the TTS endpoint.
        style_choice: Style label inserted into the script-generation prompt.
        chunk_size: Token limit per document chunk.
        temperature: Sampling temperature for both chat completions.
        max_tokens: ``max_tokens`` for both chat completions.
        progress: Gradio progress reporter.

    Returns:
        A 3-tuple ``(display_data, audio_path, status)``. On success
        ``display_data`` is a list of ``[segment label, content, script]``
        rows, ``audio_path`` is the audio of the last chunk, and ``status`` is
        ``"Processing complete"``. On failure the first two items are ``None``
        and ``status`` starts with ``"Error: "``.
    """
    # Fail early with a readable message instead of an opaque TypeError /
    # AttributeError from the code below.
    if not api_key:
        return None, None, "Error: OpenAI API key is required"
    if pdf_file is None:
        return None, None, "Error: no PDF document was provided"

    try:
        os.environ['OPENAI_API_KEY'] = api_key
        # Accept both a plain path and a file-like wrapper exposing `.name`.
        document_path = getattr(pdf_file, 'name', pdf_file)

        progress(0.1, desc="Initializing...")
        # Start from a clean directory on every run.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.DocumentType(),
                'voice': pxt.StringType(),
                'style': pxt.StringType(),
            },
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{
            'document': document_path,
            'voice': voice_choice,
            'style': style_choice,
        }])

        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=int(chunk_size),  # defensive: ensure an integer limit
            ),
        )

        progress(0.4, desc="Text processing...")
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {'role': 'system', 'content': CONTENT_SYSTEM_PROMPT},
                {'role': 'user', 'content': chunks.text},
            ],
            model=CHAT_MODEL,
            max_tokens=int(max_tokens),
            temperature=temperature,
        )
        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        # The style is interpolated from the plain Python value. Formatting the
        # column reference (docs.style) into an f-string would embed the
        # expression's string form, not the selected style.
        script_system_prompt = (
            f"Convert content to audio script. Style: {style_choice} "
            "Format: - Clear sentence structures - Natural pauses (...) \n"
            "- Term definitions when needed - Proper transitions "
            "- Appropriate pronunciation guidance"
        )
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {'role': 'system', 'content': script_system_prompt},
                {'role': 'user', 'content': chunks.content},
            ],
            model=CHAT_MODEL,
            max_tokens=int(max_tokens),
            temperature=temperature,
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.AudioType())
        def generate_audio(script: str, voice: str):
            """Synthesize `script` with `voice`; return an mp3 path or None."""
            if not script or not voice:
                return None
            try:
                response = requests.post(
                    TTS_ENDPOINT,
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": TTS_MODEL, "input": script, "voice": voice},
                    timeout=TTS_TIMEOUT_SECONDS,
                )
                if response.status_code == 200:
                    # delete=False: the file must outlive this function so the
                    # returned path stays readable.
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
                        temp_file.write(response.content)
                    return temp_file.name
                print(f"Error in audio synthesis: HTTP {response.status_code}")
            except Exception as e:
                # Best-effort: a failed chunk yields no audio rather than
                # aborting the whole run.
                print(f"Error in audio synthesis: {e}")
            return None

        chunks['audio'] = generate_audio(chunks.script, docs.voice)

        # Only the audio of the last chunk is handed back to the UI.
        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]

        results = chunks.select(chunks.content, chunks.script).collect()
        display_data = [
            [f"Segment {idx + 1}", row['content'], row['script']]
            for idx, row in enumerate(results)
        ]

        progress(1.0, desc="Complete")
        return display_data, audio_path, "Processing complete"

    except Exception as e:
        return None, None, f"Error: {str(e)}"


# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed from the order of the `with` statements — confirm the layout.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown("\nPixeltable\n\n📄 Document to Audio Synthesis 🎧\n\n")

    with gr.Row():
        with gr.Column():
            with gr.Accordion("🎯 What does it do?", open=False):
                gr.Markdown(
                    " 1. 📄 **Document Processing:** PDF extraction and token-based chunking"
                    " 2. 🤖 **Content Pipeline:** LLM-powered text optimization and script generation"
                    " 3. 🔊 **Audio Generation:** Neural TTS synthesis with voice modulation "
                )
        with gr.Column():
            with gr.Accordion("⚡ How does it work?", open=False):
                gr.Markdown(
                    " 1. 📑 **Segmentation:** Token-based document chunking with configurable limits"
                    " 2. 🔍 **Transformation:** Dual-pass LLM processing with temperature control"
                    " 3. 🎵 **Synthesis:** OpenAI TTS with multi-voice capability "
                )

    gr.HTML(
        "\n\n⚠️ API Cost Notice: This application uses OpenAI's Text-to-Speech API"
        " which incurs costs per use. See OpenAI's TTS Documentation for current"
        " pricing information.\n\n"
    )

    with gr.Row():
        with gr.Column():
            with gr.Accordion("🔑 Input & Voice", open=True):
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password",
                )
                file_input = gr.File(
                    label="PDF Document",
                    file_types=[".pdf"],
                )
        with gr.Column():
            with gr.Accordion("⚙️ Processing Configuration", open=True):
                style_select = gr.Radio(
                    choices=["Technical", "Narrative", "Instructional", "Descriptive"],
                    value="Technical",
                    label="💫 Style",
                )
                with gr.Row():
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice Model",
                    )
                with gr.Row():
                    chunk_size = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="📏 Chunk Size",
                    )
                    temperature = gr.Slider(
                        minimum=0, maximum=1, value=0.7, step=0.1,
                        label="🌡️ Temperature",
                    )
                    max_tokens = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="📊 Tokens",
                    )

    with gr.Row():
        process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)

    with gr.Tabs():
        with gr.TabItem("📝 Content"):
            output_table = gr.Dataframe(
                headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                wrap=True,
            )
        with gr.TabItem("🎧 Audio"):
            with gr.Row():
                with gr.Column(scale=2):
                    audio_output = gr.Audio(
                        label="🔊 Generated Audio",
                        type="filepath",
                        show_download_button=True,
                    )
                with gr.Column(scale=1):
                    with gr.Accordion("📚 Technical Notes", open=True):
                        gr.Markdown(
                            " - 🎯 Temperature < 0.5: Deterministic output"
                            " - 📏 Chunk size affects token context"
                            " - 🎙️ Voice models vary in prosody"
                            " - 💰 API usage is billed per character "
                        )

    gr.HTML("\n\n🚀 Powered by Pixeltable | 📚 Docs | 🤗 HF Space\n\n")

    def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
        """Click handler: run the pipeline and return values for the two outputs.

        `process_document` returns three values, but only the table and the
        audio player are wired as outputs. The status string is therefore
        consumed here: a failure is raised as `gr.Error` so the user sees the
        message instead of silently getting empty outputs.
        """
        display_data, audio_path, status = process_document(
            pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
        )
        if status.startswith("Error:"):
            raise gr.Error(status)
        return display_data, audio_path

    process_btn.click(
        update_interface,
        inputs=[
            file_input, api_key, voice_select, style_select,
            chunk_size, temperature, max_tokens,
        ],
        outputs=[output_table, audio_output],
    )

if __name__ == "__main__":
    demo.launch()