# llm_host/utils.py
import ollama


async def generate_stream(query: str):
    """Generates streamed responses from Ollama using LLaMA 3.2."""
    try:
        stream = ollama.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": query}],
            stream=True,
        )
        # Yield each chunk as soon as it arrives; no artificial delay needed.
        for chunk in stream:
            if "message" in chunk and "content" in chunk["message"]:
                yield chunk["message"]["content"]
    except Exception as e:
        yield f"⚠️ Error: {str(e)}"
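Note that `ollama.chat` is a synchronous call, so each chunk is produced by blocking work inside the async generator. A minimal sketch of a fully non-blocking variant, assuming the same `llama3.2` model and a locally running Ollama server, could use the library's `AsyncClient` instead; the names `generate_stream_async` and `main` below are illustrative, not part of the original module:

```python
import asyncio
from ollama import AsyncClient


async def generate_stream_async(query: str):
    """Async variant: streams chunks without blocking the event loop."""
    try:
        client = AsyncClient()
        # With stream=True, awaiting chat() yields an async iterator of chunks.
        async for chunk in await client.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": query}],
            stream=True,
        ):
            # Mirror the original dict-style access to the chunk payload.
            if "message" in chunk and "content" in chunk["message"]:
                yield chunk["message"]["content"]
    except Exception as e:
        yield f"⚠️ Error: {str(e)}"


async def main():
    # Consume the generator and print tokens as they arrive.
    async for token in generate_stream_async("Why is the sky blue?"):
        print(token, end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())
```

Because this sketch keeps the same yield-per-chunk contract as `generate_stream`, any async consumer (for example, a streaming HTTP response handler) could swap between the two without changes on the caller side.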