import gradio as gr
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions import openai
import os
import requests
import tempfile


def process_document(pdf_file, api_key: str, voice_choice: str, style_choice: str, chunk_size: int, temperature: float, max_tokens: int, progress=gr.Progress()):
    """Convert an uploaded PDF into narrated audio via a Pixeltable pipeline.

    Pipeline: ingest the document into a fresh Pixeltable table, split it into
    token-limited chunks, rewrite each chunk into clean content and then into an
    audio script with OpenAI chat completions (both defined as computed columns,
    so they run per chunk), and synthesize speech for each script with the
    OpenAI TTS endpoint.

    Args:
        pdf_file: Uploaded file object from Gradio; only ``.name`` (the temp
            path on disk) is used.
        api_key: OpenAI API key; exported to ``OPENAI_API_KEY`` for the
            Pixeltable OpenAI functions and used directly for the TTS request.
        voice_choice: OpenAI TTS voice name stored per document.
        style_choice: Narration style label injected into the script prompt.
        chunk_size: Token limit per document chunk (DocumentSplitter).
        temperature: Sampling temperature for both chat-completion steps.
        max_tokens: Completion cap for both chat-completion steps.
        progress: Gradio progress reporter (injected by Gradio).

    Returns:
        Tuple ``(display_data, audio_path, status_message)`` where
        ``display_data`` is a list of ``[segment label, content, script]`` rows
        and ``audio_path`` is the synthesized audio file of the last chunk.
        On any failure returns ``(None, None, "Error: ...")``.
    """
    try:
        # Pixeltable's OpenAI functions read the key from the environment.
        # NOTE(review): this mutates process-global state per request.
        os.environ['OPENAI_API_KEY'] = api_key

        progress(0.1, desc="Initializing...")

        # Start from a clean slate: drop and recreate the working directory,
        # so repeated runs don't collide with stale tables/views.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        # One row per uploaded document, carrying its voice/style settings.
        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.DocumentType(),
                'voice': pxt.StringType(),
                'style': pxt.StringType()
            }
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{
            'document': pdf_file.name,
            'voice': voice_choice,
            'style': style_choice
        }])

        # View with one row per token-limited chunk of the document.
        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=chunk_size
            )
        )

        progress(0.4, desc="Text processing...")
        # Computed column: rewrite each raw chunk into structured content.
        # Assigning an expression defines the column; Pixeltable evaluates it
        # per row (per chunk).
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': """Transform this text segment into clear, concise content. Structure: 1. Core concepts and points 2. Supporting details 3. Key takeaways"""
                },
                {'role': 'user', 'content': chunks.text}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        # Pull the assistant text out of the chat-completion response JSON.
        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        # Computed column: turn the cleaned content into a narration script.
        # NOTE(review): this f-string is evaluated once, when the column is
        # defined — `{docs.style}` interpolates the column *expression*, not the
        # per-row style value. Confirm against Pixeltable's string-formatting
        # support whether this renders the intended per-row style.
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': f"""Convert content to audio script. Style: {docs.style} Format: - Clear sentence structures - Natural pauses (...) 
- Term definitions when needed - Proper transitions - Appropriate pronunciation guidance"""
                },
                {'role': 'user', 'content': chunks.content}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.AudioType())
        def generate_audio(script: str, voice: str):
            """Synthesize one script chunk to an mp3 via the OpenAI TTS API.

            Returns the path of a temp .mp3 file, or None on empty input,
            non-200 response, or any request/IO error (errors are printed,
            not raised, so one bad chunk doesn't abort the pipeline).
            """
            if not script or not voice:
                return None
            try:
                # NOTE(review): no timeout= on this request; a stalled TTS
                # call would hang the pipeline.
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": "tts-1", "input": script, "voice": voice}
                )
                if response.status_code == 200:
                    # delete=False is deliberate: the file must outlive this
                    # call so Pixeltable can ingest it as the column value.
                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
                    temp_file.write(response.content)
                    temp_file.close()
                    return temp_file.name
            except Exception as e:
                print(f"Error in audio synthesis: {e}")
                return None
            # Falls through to an implicit None on non-200 responses.

        # Computed column: one audio file per chunk.
        chunks['audio'] = generate_audio(chunks.script, docs.voice)

        # NOTE(review): only the *last* chunk's audio is surfaced to the UI;
        # earlier chunks are synthesized but not returned.
        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]

        results = chunks.select(chunks.content, chunks.script).collect()
        display_data = [
            [f"Segment {idx + 1}", row['content'], row['script']]
            for idx, row in enumerate(results)
        ]

        progress(1.0, desc="Complete")
        return display_data, audio_path, "Processing complete"
    except Exception as e:
        # Top-level UI boundary: report any failure as a status string
        # instead of raising into Gradio.
        return None, None, f"Error: {str(e)}"


with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown( """
⚠️ API Cost Notice: This application uses OpenAI's Text-to-Speech API which incurs costs per use. See OpenAI's TTS Documentation for current pricing information.
🚀 Powered by Pixeltable | 📚 Docs | 🤗 HF Space