import asyncio
import ollama
from typing import List
import time
def cosine_similarity(embedding_0, embedding_1):
    """Return the cosine similarity between two embedding vectors.

    The similarity is ``dot(a, b) / (|a| * |b|)``.

    Args:
        embedding_0: First vector, any iterable of numbers.
        embedding_1: Second vector, any iterable of numbers with the same
            length as ``embedding_0``.

    Returns:
        A float in ``[-1.0, 1.0]``. ``0.0`` is returned when either vector
        has zero magnitude (including two empty vectors), because the angle
        is undefined in that case.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    # Materialise both inputs so generators and array-likes can be measured
    # and iterated more than once.
    vec_a = [float(value) for value in embedding_0]
    vec_b = [float(value) for value in embedding_1]

    if len(vec_a) != len(vec_b):
        raise ValueError(
            "Embeddings must have the same length: "
            f"{len(vec_a)} != {len(vec_b)}"
        )

    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm_a = sum(a * a for a in vec_a) ** 0.5
    norm_b = sum(b * b for b in vec_b) ** 0.5

    # Avoid ZeroDivisionError for zero-magnitude vectors.
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0

    # Rounding can push the ratio marginally outside [-1, 1]; clamp it so
    # callers can safely feed the result to acos() and similar functions.
    return max(-1.0, min(1.0, dot / (norm_a * norm_b)))

def generate_embedding(model, text: str, model_type: str) -> List[float]:
    """Placeholder for embedding generation; not implemented yet.

    The function currently performs no work and returns ``None`` for every
    input, despite the ``List[float]`` return annotation.

    Args:
        model: Unused for now.
        text: Unused for now.
        model_type: Unused for now.

    Returns:
        ``None`` until an implementation is provided.
    """
    # TODO: implement; the intended backends for `model_type` are not
    # defined anywhere in this module.
    return None

async def generate_stream(query: str):
    """Stream a chat response from Ollama, retrying failed attempts.

    Sends ``query`` as a single user message to the ``mistral`` model and
    yields each content fragment as it arrives. If an attempt fails before
    any content has been produced, it is retried up to five times with a
    three-second pause between attempts.

    Args:
        query: The user prompt to send to the model.

    Yields:
        str: Successive content fragments of the model's reply. On failure a
        single human-readable warning string is yielded instead of raising:
        one message when every attempt failed before producing output, and a
        different one when the stream broke after output had started.
    """
    max_retries = 5  # Total number of attempts
    delay = 3  # Seconds to wait between attempts

    for attempt in range(max_retries):
        # Tracks whether this attempt has already sent content downstream.
        emitted = False
        try:
            # NOTE(review): ollama.chat is the synchronous client, so fetching
            # each chunk still blocks the event loop. ollama.AsyncClient would
            # avoid that; confirm the installed version before switching.
            stream = ollama.chat(
                model="mistral",  # Use your preferred model
                messages=[{"role": "user", "content": query}],
                stream=True
            )
            for chunk in stream:
                if "message" in chunk and "content" in chunk["message"]:
                    emitted = True
                    yield chunk["message"]["content"]
                    # Give other tasks a chance to run between chunks.
                    await asyncio.sleep(0)
            return
        except Exception as e:
            print(f"❌ Ollama connection failed (Attempt {attempt+1}/{max_retries}): {e}")
            if emitted:
                # Retrying would restart the answer from the beginning and
                # duplicate text the consumer has already received.
                yield "⚠️ Error: Ollama stream was interrupted."
                return
            if attempt < max_retries - 1:
                # Non-blocking wait: time.sleep() here would freeze every
                # other coroutine on the event loop for the whole delay.
                await asyncio.sleep(delay)
            else:
                yield "⚠️ Error: Could not connect to Ollama after multiple attempts."