sagar007 committed on
Commit b17a402 · verified · 1 Parent(s): 270de0e

Update app.py

Files changed (1)
  1. app.py +105 -94
app.py CHANGED
@@ -8,13 +8,14 @@ from datetime import datetime
 import os
 import subprocess
 import numpy as np
+from typing import List, Dict, Tuple, Any
 
-## Install required dependencies for Kokoro with better error handling
+# Install required dependencies for Kokoro with better error handling
 try:
     subprocess.run(['git', 'lfs', 'install'], check=True)
     if not os.path.exists('Kokoro-82M'):
         subprocess.run(['git', 'clone', 'https://huggingface.co/hexgrad/Kokoro-82M'], check=True)
-
+
     # Try installing espeak with proper package manager commands
     try:
         # Update package list first
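
Note: the `from typing import ...` line added here backs the type annotations introduced throughout the rest of this diff. Separately, the surrounding setup block shells out to git and apt on every startup; probing first would let it fail fast. A minimal sketch, assuming an `ensure_espeak` helper that is not part of this commit:

    import shutil
    import subprocess

    def ensure_espeak() -> bool:
        """Return True if espeak/espeak-ng is usable, trying apt as a fallback."""
        if shutil.which('espeak') or shutil.which('espeak-ng'):
            return True  # already installed, skip the package manager entirely
        try:
            subprocess.run(['apt-get', 'update'], check=True)
            subprocess.run(['apt-get', 'install', '-y', 'espeak'], check=True)
            return shutil.which('espeak') is not None
        except Exception as e:
            print(f"Warning: could not install espeak: {e}")
            return False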
@@ -32,42 +33,58 @@ except Exception as e:
     print(f"Warning: Initial setup error: {str(e)}")
     print("Continuing with limited functionality...")
 
-# Initialize models and tokenizers
+# --- Initialization (Do this ONCE) ---
 model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 tokenizer.pad_token = tokenizer.eos_token
 
-# Move model initialization inside a function to prevent CUDA initialization in main process
-def init_models():
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        device_map="auto",
-        offload_folder="offload",
-        low_cpu_mem_usage=True,
-        torch_dtype=torch.float16
-    )
-    return model
+# Initialize DeepSeek model
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",
+    offload_folder="offload",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.float16
+)
+
+# Initialize Kokoro TTS (with error handling)
+VOICE_CHOICES = {
+    '🇺🇸 Female (Default)': 'af',
+    '🇺🇸 Bella': 'af_bella',
+    '🇺🇸 Sarah': 'af_sarah',
+    '🇺🇸 Nicole': 'af_nicole'
+}
+TTS_ENABLED = False
+TTS_MODEL = None
+VOICEPACK = None
 
-# Initialize Kokoro TTS with better error handling
 try:
-    import sys
-    sys.path.append('Kokoro-82M')
-    from models import build_model
-    from kokoro import generate
-
-    # Don't initialize models/voices in main process for ZeroGPU compatibility
-    VOICE_CHOICES = {
-        '🇺🇸 Female (Default)': 'af',
-        '🇺🇸 Bella': 'af_bella',
-        '🇺🇸 Sarah': 'af_sarah',
-        '🇺🇸 Nicole': 'af_nicole'
-    }
-    TTS_ENABLED = True
+    if os.path.exists('Kokoro-82M'):
+        import sys
+        sys.path.append('Kokoro-82M')
+        from models import build_model  # type: ignore
+        from kokoro import generate  # type: ignore
+
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Correct device handling
+        TTS_MODEL = build_model('Kokoro-82M/kokoro-v0_19.pth', device)
+
+        # Load default voice
+        try:
+            VOICEPACK = torch.load('Kokoro-82M/voices/af.pt', map_location=device, weights_only=True)
+        except Exception as e:
+            print(f"Warning: Could not load default voice: {e}")
+            raise
+
+        TTS_ENABLED = True
+    else:
+        print("Warning: Kokoro-82M directory not found. TTS disabled.")
+
 except Exception as e:
     print(f"Warning: Could not initialize Kokoro TTS: {str(e)}")
     TTS_ENABLED = False
 
-def get_web_results(query, max_results=5): # Increased to 5 for better context
+
+def get_web_results(query: str, max_results: int = 5) -> List[Dict[str, str]]:
     """Get web search results using DuckDuckGo"""
     try:
         with DDGS() as ddgs:
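
Note: this hunk replaces the lazy `init_models()` helper with an eager, import-time `AutoModelForCausalLM.from_pretrained(...)`. The deleted comment documented why loading was deferred: on ZeroGPU Spaces, CUDA must not be initialized in the main process. If eager loading trips that restriction, a cached lazy loader preserves both load-once behavior and deferred CUDA init. A sketch under those assumptions, not code from this commit:

    from functools import lru_cache

    import torch
    from transformers import AutoModelForCausalLM

    MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

    @lru_cache(maxsize=1)
    def get_model():
        """Load the LLM on first use; lru_cache makes later calls reuse it."""
        return AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
        )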
@@ -79,30 +96,27 @@ def get_web_results(query, max_results=5): # Increased to 5 for better context
             "date": result.get("published", "")
         } for result in results]
     except Exception as e:
+        print(f"Error in web search: {e}")
         return []
 
-def format_prompt(query, context):
+def format_prompt(query: str, context: List[Dict[str, str]]) -> str:
     """Format the prompt with web context"""
     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     context_lines = '\n'.join([f'- [{res["title"]}]: {res["snippet"]}' for res in context])
     return f"""You are an intelligent search assistant. Answer the user's query using the provided web context.
 Current Time: {current_time}
-
 Important: For election-related queries, please distinguish clearly between different election years and types (presidential vs. non-presidential). Only use information from the provided web context.
-
 Query: {query}
-
 Web Context:
 {context_lines}
-
 Provide a detailed answer in markdown format. Include relevant information from sources and cite them using [1], [2], etc. If the query is about elections, clearly specify which year and type of election you're discussing.
 Answer:"""
 
-def format_sources(web_results):
+def format_sources(web_results: List[Dict[str, str]]) -> str:
     """Format sources with more details"""
     if not web_results:
         return "<div class='no-sources'>No sources available</div>"
-
+
     sources_html = "<div class='sources-container'>"
     for i, res in enumerate(web_results, 1):
         title = res["title"] or "Source"
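
Note: the prompt instructs the model to cite sources as [1], [2], etc., but `context_lines` renders the snippets without any indices to cite. A hypothetical numbering helper would ground that instruction; a sketch:

    def numbered_context(context):
        """Render snippets as '[1] Title: snippet' so citations have targets."""
        return '\n'.join(
            f'[{i}] {res["title"]}: {res["snippet"]}'
            for i, res in enumerate(context, 1)
        )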
@@ -120,22 +134,18 @@ def format_sources(web_results):
     sources_html += "</div>"
     return sources_html
 
-# Wrap the answer generation with spaces.GPU decorator
 @spaces.GPU(duration=30)
-def generate_answer(prompt):
+def generate_answer(prompt: str) -> str:
     """Generate answer using the DeepSeek model"""
-    # Initialize model inside the GPU-decorated function
-    model = init_models()
-
     inputs = tokenizer(
-        prompt,
-        return_tensors="pt",
+        prompt,
+        return_tensors="pt",
         padding=True,
         truncation=True,
         max_length=512,
         return_attention_mask=True
     ).to(model.device)
-
+
     outputs = model.generate(
         inputs.input_ids,
         attention_mask=inputs.attention_mask,
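
Note: with `init_models()` gone, `generate_answer` now reads the module-level `model`. Decoding `outputs[0]` returns the prompt plus the completion, which is why the caller later splits on "Answer:". Slicing off the prompt tokens is the usual alternative; a sketch with an assumed helper name:

    def decode_new_tokens(tokenizer, inputs, outputs):
        """Decode only the tokens generated after the prompt."""
        prompt_len = inputs.input_ids.shape[1]  # number of prompt tokens
        return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)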
@@ -148,74 +158,75 @@ def generate_answer(prompt):
     )
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-# Similarly wrap TTS generation with spaces.GPU
 @spaces.GPU(duration=60)
-def generate_speech_with_gpu(text, voice_name='af'):
-    """Generate speech from text using Kokoro TTS model with GPU handling"""
+def generate_speech_with_gpu(text: str, voice_name: str = 'af', tts_model=TTS_MODEL, voicepack=VOICEPACK) -> Tuple[int, np.ndarray] | None:
+    """Generate speech from text using Kokoro TTS model."""
+
+    if not TTS_ENABLED or tts_model is None:
+        print("TTS is not enabled or model is not loaded.")
+        return None
+
     try:
-        # Initialize TTS model and voice inside GPU function
-        device = 'cuda'
-        TTS_MODEL = build_model('Kokoro-82M/kokoro-v0_19.pth', device)
-        VOICEPACK = torch.load(f'Kokoro-82M/voices/{voice_name}.pt', weights_only=True).to(device)
-
+        # Load voicepack if it hasn't been loaded or if a different voice is requested
+        if voice_name != 'af' or voicepack is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+            voicepack = torch.load(f'Kokoro-82M/voices/{voice_name}.pt', map_location=device, weights_only=True)
+
+
         # Clean the text
         clean_text = ' '.join([line for line in text.split('\n') if not line.startswith('#')])
         clean_text = clean_text.replace('[', '').replace(']', '').replace('*', '')
-
-        # Split long text into chunks
+
+        # Split long text into chunks (improved logic)
        max_chars = 1000
         chunks = []
-
         if len(clean_text) > max_chars:
             sentences = clean_text.split('.')
             current_chunk = ""
-
             for sentence in sentences:
-                if len(current_chunk) + len(sentence) < max_chars:
+                if len(current_chunk) + len(sentence) + 1 < max_chars:  # +1 for the dot
                     current_chunk += sentence + "."
                 else:
-                    if current_chunk:
-                        chunks.append(current_chunk)
+                    chunks.append(current_chunk.strip())
                     current_chunk = sentence + "."
-            if current_chunk:
-                chunks.append(current_chunk)
+            if current_chunk:  # Add the last chunk
+                chunks.append(current_chunk.strip())
         else:
             chunks = [clean_text]
-
+
+
         # Generate audio for each chunk
         audio_chunks = []
         for chunk in chunks:
             if chunk.strip():  # Only process non-empty chunks
-                chunk_audio, _ = generate(TTS_MODEL, chunk.strip(), VOICEPACK, lang='a')
+                chunk_audio, _ = generate(tts_model, chunk, voicepack, lang='a')
                 if isinstance(chunk_audio, torch.Tensor):
                     chunk_audio = chunk_audio.cpu().numpy()
                 audio_chunks.append(chunk_audio)
-
-        # Concatenate chunks if we have any
+
+        # Concatenate chunks
         if audio_chunks:
-            if len(audio_chunks) > 1:
-                final_audio = np.concatenate(audio_chunks)
-            else:
-                final_audio = audio_chunks[0]
+            final_audio = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
             return (24000, final_audio)
-        return None
-
+        else:
+            return None
+
+
     except Exception as e:
         print(f"Error generating speech: {str(e)}")
         import traceback
         traceback.print_exc()
         return None
-
-def process_query(query, history, selected_voice='af'):
+def process_query(query: str, history: List[List[str]], selected_voice: str = 'af') -> Dict[str, Any]:
     """Process user query with streaming effect"""
     try:
         if history is None:
             history = []
-
+
         # Get web results first
         web_results = get_web_results(query)
         sources_html = format_sources(web_results)
-
+
         current_history = history + [[query, "*Searching...*"]]
         yield {
             answer_output: gr.Markdown("*Searching & Thinking...*"),
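
Note: the new signature binds `tts_model=TTS_MODEL, voicepack=VOICEPACK` as defaults. Python evaluates default values once, when `def` executes, so these capture whatever the globals held at import time and will not track later reassignment. A standalone illustration of the pitfall and the conventional None-default fix (not code from this commit):

    MODEL = None

    def bad(model=MODEL):
        """Default captured at definition time: stays None forever."""
        return model

    def good(model=None):
        """None-default resolved at call time: sees the current global."""
        return model if model is not None else MODEL

    MODEL = "loaded"
    assert bad() is None
    assert good() == "loaded"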
@@ -224,48 +235,48 @@ def process_query(query, history, selected_voice='af'):
             chat_history_display: current_history,
             audio_output: None
         }
-
+
         # Generate answer
         prompt = format_prompt(query, web_results)
         answer = generate_answer(prompt)
         final_answer = answer.split("Answer:")[-1].strip()
-
-        # Generate speech from the answer
+
+        # Update history *before* TTS (important for correct display)
+        updated_history = history + [[query, final_answer]]
+
+        # Generate speech from the answer (only if enabled)
         if TTS_ENABLED:
+            yield {  # Intermediate update before TTS
+                answer_output: gr.Markdown(final_answer),
+                sources_output: gr.HTML(sources_html),
+                search_btn: gr.Button("Generating audio...", interactive=False),
+                chat_history_display: updated_history,
+                audio_output: None
+            }
             try:
-                yield {
-                    answer_output: gr.Markdown(final_answer),
-                    sources_output: gr.HTML(sources_html),
-                    search_btn: gr.Button("Generating audio...", interactive=False),
-                    chat_history_display: history + [[query, final_answer]],
-                    audio_output: None
-                }
-
                 audio = generate_speech_with_gpu(final_answer, selected_voice)
-                if audio is None:
-                    print("Failed to generate audio")
             except Exception as e:
-                print(f"Error in speech generation: {str(e)}")
+                print(f"Error during TTS: {e}")
                 audio = None
         else:
             audio = None
-
-        updated_history = history + [[query, final_answer]]
+
         yield {
             answer_output: gr.Markdown(final_answer),
             sources_output: gr.HTML(sources_html),
             search_btn: gr.Button("Search", interactive=True),
             chat_history_display: updated_history,
-            audio_output: audio if audio is not None else gr.Audio(value=None)
+            audio_output: audio if audio is not None else gr.Audio(value=None)  # Ensure valid audio output
         }
+
     except Exception as e:
         error_message = str(e)
         if "GPU quota" in error_message:
-            error_message = "⚠️ GPU quota exceeded. Please try again later when the daily quota resets."
-
+            error_message = "⚠️ GPU quota exceeded. Please try again later when the daily quota resets."
+
         yield {
             answer_output: gr.Markdown(f"Error: {error_message}"),
-            sources_output: gr.HTML(sources_html),
+            sources_output: gr.HTML(sources_html),  # Still show sources on error
             search_btn: gr.Button("Search", interactive=True),
             chat_history_display: history + [[query, f"*Error: {error_message}*"]],
             audio_output: None