Delete src
Browse files- src/interface.py +0 -215
- src/processor.py +0 -120
src/interface.py
DELETED
@@ -1,215 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
from .processor import process_document
|
3 |
-
|
4 |
-
# Synthesis mode presets. Each mode bundles:
#   - "description":   short text shown in the UI when the mode is selected
#   - "styles":        the style choices offered for that mode (first entry
#                      is the default selection)
#   - "default_temp":  default LLM sampling temperature for the slider
#   - "default_chunks": default token-limit chunk size for the slider
#   - "system_prompt": system message sent to the LLM for this mode
# Key order matters: it is the display order of the mode Radio choices.
SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": """Convert this content into clear narration."""
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": """Transform this content into engaging podcast-style speech."""
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": """Convert this content into a presentation format."""
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": """Transform this content into an engaging story."""
    }
}
|
34 |
-
|
35 |
-
def create_interface():
    """Build and return the Gradio Blocks UI for document-to-audio synthesis.

    Lays out header, overview accordions, core settings (API key + PDF
    upload), mode/voice/style selectors, processing sliders, and the
    output tabs, then wires two callbacks:

    - ``mode_select.change`` -> ``update_mode``: refresh style choices,
      chunk-size/temperature defaults, and the mode description.
    - ``process_btn.click`` -> ``update_interface``: run
      ``process_document`` and fill the content table, audio player,
      and status box.

    Returns:
        gr.Blocks: the assembled (not yet launched) Gradio app.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header: logo + title.
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>📄 Document to Audio Synthesis 🎧</h1>
            </div>
            """
        )

        # Overview Row
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                    - 📄 Document processing - 🧠 Content transformation
                    - 🎧 Audio synthesis - ⚙️ Multiple output styles
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                    1. 📑 **Processing:** Token-based segmentation
                    2. 🔍 **Analysis:** LLM optimization & scripts
                    3. 🎵 **Synthesis:** Multiple voice options
                    """)

        # NOTE(review): this State is created but never read or written by
        # any callback below — looks like dead code; confirm before removing.
        synthesis_mode = gr.State(SYNTHESIS_MODES["narration"])

        # Main Settings Row
        with gr.Row():
            # Core Settings Column
            with gr.Column():
                with gr.Accordion("🔑 Core Settings", open=True):
                    with gr.Row():
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            scale=2
                        )
                        file_input = gr.File(
                            label="PDF Document",
                            file_types=[".pdf"],
                            scale=1
                        )

            # Mode Selection Column
            with gr.Column():
                with gr.Accordion("🎭 Output Mode", open=True):
                    mode_select = gr.Radio(
                        choices=list(SYNTHESIS_MODES.keys()),
                        value="narration",
                        label="Select Mode",
                        info="Choose output style"
                    )
                    # Updated by update_mode() whenever the mode changes.
                    mode_description = gr.Markdown(
                        SYNTHESIS_MODES["narration"]["description"]
                    )

        # Voice and Processing Settings Row
        with gr.Row():
            # Voice Settings Column
            with gr.Column():
                with gr.Accordion("🎛️ Voice & Style", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True
                    )
                    # Choices/default are replaced per-mode by update_mode().
                    style_select = gr.Radio(
                        choices=SYNTHESIS_MODES["narration"]["styles"],
                        value=SYNTHESIS_MODES["narration"]["styles"][0],
                        label="💫 Style",
                        interactive=True
                    )

            # Processing Settings Column
            with gr.Column():
                with gr.Accordion("⚙️ Processing Parameters", open=True):
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100, maximum=1000,
                            value=SYNTHESIS_MODES["narration"]["default_chunks"],
                            step=50,
                            label="📏 Chunk Size"
                        )
                        temperature = gr.Slider(
                            minimum=0, maximum=1,
                            value=SYNTHESIS_MODES["narration"]["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature"
                        )
                        max_tokens = gr.Slider(
                            minimum=100, maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Tokens"
                        )

        # Process Button Row
        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)

        # Output Section
        with gr.Tabs():
            with gr.TabItem("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True
                )
            with gr.TabItem("🎧 Audio"):
                with gr.Row():
                    with gr.Column(scale=2):
                        audio_output = gr.Audio(
                            label="🔊 Synthesized Audio",
                            type="filepath",
                            show_download_button=True
                        )
                    with gr.Column(scale=1):
                        with gr.Accordion("📚 Quick Tips", open=True):
                            gr.Markdown("""
                            - 🎯 Lower temperature = more consistent
                            - 📏 Smaller chunks = more precise
                            - 🎙️ Try different voices for best fit
                            - 💫 Match style to content type
                            """)

        # Footer with project links.
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
                    | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
                </p>
            </div>
            """
        )

        def update_mode(mode_name):
            """Sync style choices, slider defaults, and description to the mode."""
            mode = SYNTHESIS_MODES[mode_name]
            return (
                gr.update(choices=mode["styles"], value=mode["styles"][0]),
                gr.update(value=mode["default_chunks"]),
                gr.update(value=mode["default_temp"]),
                mode["description"]
            )

        mode_select.change(
            update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description]
        )

        def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
            """Resolve the mode's system prompt and run the processing pipeline."""
            mode = SYNTHESIS_MODES[mode_name]
            return process_document(
                pdf_file=pdf_file,
                api_key=api_key,
                voice_choice=voice,
                style_choice=style,
                chunk_size=chunk_size,
                temperature=temperature,
                max_tokens=max_tokens,
                system_prompt=mode["system_prompt"]
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, mode_select, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/processor.py
DELETED
@@ -1,120 +0,0 @@
|
|
1 |
-
import pixeltable as pxt
|
2 |
-
from pixeltable.iterators import DocumentSplitter
|
3 |
-
from pixeltable.functions import openai
|
4 |
-
import os
|
5 |
-
import requests
|
6 |
-
import tempfile
|
7 |
-
import gradio as gr
|
8 |
-
|
9 |
-
def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size, temperature, max_tokens, system_prompt, progress=gr.Progress()):
    """Run the full document -> LLM -> TTS pipeline and return UI outputs.

    Pipeline: recreate the ``document_audio`` Pixeltable directory, insert
    the uploaded PDF, split it into token-limited chunks, run two LLM
    passes (content transformation, then audio-script generation), then
    synthesize speech for each chunk via the OpenAI TTS endpoint.

    Args:
        pdf_file: uploaded file object (Gradio ``gr.File`` value); its
            ``.name`` is the path on disk.
        api_key: OpenAI API key; exported to the environment for Pixeltable's
            OpenAI functions and used directly for the TTS request.
        voice_choice: TTS voice name (e.g. "onyx").
        style_choice: style label interpolated into the script prompt.
        chunk_size: token limit per document chunk.
        temperature: LLM sampling temperature for both passes.
        max_tokens: per-chunk completion cap for both passes.
        system_prompt: mode-specific system message for the first LLM pass.
        progress: Gradio progress reporter.

    Returns:
        tuple: ``(display_data, audio_path, status_message)`` where
        ``display_data`` is rows of [segment label, content, script];
        on failure, ``(None, None, "Error: ...")``.
    """
    try:
        # Guard clauses: fail fast with a readable message instead of an
        # opaque AttributeError from pdf_file.name deep in the pipeline.
        if pdf_file is None:
            return None, None, "Error: Please upload a PDF document"
        if not api_key:
            return None, None, "Error: Please provide an OpenAI API key"

        os.environ['OPENAI_API_KEY'] = api_key

        progress(0.1, desc="Initializing...")
        # Rebuild the workspace from scratch on every run.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.Document,
                'voice': pxt.String,
                'style': pxt.String,
                'mode_prompt': pxt.String
            }
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{
            'document': pdf_file.name,
            'voice': voice_choice,
            'style': style_choice,
            'mode_prompt': system_prompt
        }])

        # One row per token-limited chunk of the document.
        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=chunk_size
            )
        )

        progress(0.4, desc="Text processing...")
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': docs.mode_prompt  # Use the mode-specific prompt
                },
                {'role': 'user', 'content': chunks.text}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )

        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        # Second pass: turn the transformed content into a speakable script.
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': f"""Convert content to audio script.
                    Style: {docs.style}
                    Format:
                    - Clear sentence structures
                    - Natural pauses (...)
                    - Term definitions when needed
                    - Proper transitions"""
                },
                {'role': 'user', 'content': chunks.content}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.Audio)
        def generate_audio(script: str, voice: str):
            """Synthesize one chunk's script to an mp3 file; None on failure."""
            if not script or not voice:
                return None
            try:
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": "tts-1", "input": script, "voice": voice}
                )
                if response.status_code == 200:
                    # delete=False: the path outlives this UDF and is served
                    # by the Audio component. Not cleaned up here.
                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
                    temp_file.write(response.content)
                    temp_file.close()
                    return temp_file.name
                # Previously a non-200 response fell through silently;
                # surface it so failures are diagnosable.
                print(f"Error in audio synthesis: HTTP {response.status_code}: {response.text}")
            except Exception as e:
                print(f"Error in audio synthesis: {e}")
            return None

        chunks['audio'] = generate_audio(chunks.script, docs.voice)

        # Only the last chunk's audio is surfaced in the player.
        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]

        results = chunks.select(
            chunks.content,
            chunks.script
        ).collect()

        display_data = [
            [f"Segment {idx + 1}", row['content'], row['script']]
            for idx, row in enumerate(results)
        ]

        progress(1.0, desc="Complete")
        return display_data, audio_path, "Processing complete"

    except Exception as e:
        # Broad catch is deliberate: any pipeline failure becomes a status
        # message for the UI instead of a crash.
        return None, None, f"Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|