import asyncio
import time
from typing import List

import ollama


def cosine_similarity(embedding_0, embedding_1):
    """Placeholder for comparing two embeddings by cosine similarity.

    Not implemented yet: the body is empty, so every call returns ``None``.
    """
    pass


def generate_embedding(model, text: str, model_type: str) -> List[float]:
    """Placeholder for producing an embedding of *text* with *model*.

    Not implemented yet: despite the ``List[float]`` annotation, the body is
    empty and every call currently returns ``None``.
    """
    pass


async def generate_stream(query: str):
    """Stream responses from Ollama with automatic retries.

    Sends *query* as a single user message to the ``mistral`` model and
    yields each piece of response text as it arrives.

    Args:
        query: The user prompt forwarded to the model.

    Yields:
        Response text fragments. If every attempt fails, a single
        human-readable error string is yielded instead of raising.
    """
    max_retries = 5  # Total number of attempts before giving up
    delay = 3  # Seconds to wait between attempts

    for attempt in range(max_retries):
        try:
            # NOTE(review): this is a plain (synchronous) iteration, so the
            # event loop is blocked while waiting for each chunk; the
            # sleep(0) below only hands control back between chunks.
            stream = ollama.chat(
                model="mistral",  # Use your preferred model
                messages=[{"role": "user", "content": query}],
                stream=True,
            )

            for chunk in stream:
                if "message" in chunk and "content" in chunk["message"]:
                    yield chunk["message"]["content"]
                    # Give other tasks a chance to run between chunks.
                    await asyncio.sleep(0)
            return
        except Exception as e:
            # NOTE(review): a failure after some chunks were already yielded
            # restarts the request, so the consumer may see repeated text.
            print(f"❌ Ollama connection failed (Attempt {attempt+1}/{max_retries}): {str(e)}")

            if attempt >= max_retries - 1:
                yield "⚠️ Error: Could not connect to Ollama after multiple attempts."
                continue

            # Non-blocking wait: time.sleep() here would freeze the whole
            # event loop (and every other coroutine) for the full delay.
            await asyncio.sleep(delay)