# llm_host/utils.py
import asyncio
import ollama
from typing import List

def cosine_similarity(embedding_0: List[float], embedding_1: List[float]) -> float:
    """Cosine similarity between two equal-length embedding vectors."""
    dot = sum(a * b for a, b in zip(embedding_0, embedding_1))
    norm = (sum(a * a for a in embedding_0) * sum(b * b for b in embedding_1)) ** 0.5
    return dot / norm if norm else 0.0  # define similarity as 0.0 for a zero vector

def generate_embedding(model, text: str, model_type: str) -> List[float]:
    """Embed `text`. A minimal sketch: assumes `model_type` is "ollama" (with
    `model` an Ollama model name) or anything exposing a sentence-transformers-
    style `.encode()`; adapt the branches to the models you actually use."""
    if model_type == "ollama":
        return ollama.embeddings(model=model, prompt=text)["embedding"]
    return list(model.encode(text))
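
# Hypothetical usage sketch (not in the original module): embed two texts with
# a local Ollama embedding model and score them with the helpers above. The
# model name "nomic-embed-text" is an assumption; substitute any embedding
# model you have pulled.
def _embedding_similarity_demo() -> float:
    vec_a = generate_embedding("nomic-embed-text", "Cats purr softly.", "ollama")
    vec_b = generate_embedding("nomic-embed-text", "Dogs bark loudly.", "ollama")
    return cosine_similarity(vec_a, vec_b)  # value in [-1.0, 1.0]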

async def generate_stream(query: str):
    """Stream responses from Ollama in real time."""
    stream = ollama.chat(
        model="llama3.2",  # choose your model (e.g. mistral, llama2, gemma)
        messages=[{"role": "user", "content": query}],
        stream=True,  # enable streaming
    )
    for chunk in stream:
        if "message" in chunk and "content" in chunk["message"]:
            yield chunk["message"]["content"]
        await asyncio.sleep(0)  # allow async execution between chunks
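
# Hypothetical driver (not in the original module): a minimal way to exercise
# generate_stream from the command line, assuming an Ollama server is running
# locally with the "llama3.2" model pulled.
async def _stream_demo(query: str) -> None:
    async for token in generate_stream(query):
        print(token, end="", flush=True)
    print()

if __name__ == "__main__":
    asyncio.run(_stream_demo("Why is the sky blue?"))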