import asyncio

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the latest available LLaMA model (change this if LLaMA 3 becomes available)
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Detect device (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and model.
# device_map="auto" lets accelerate place the model on the available device(s),
# so no explicit .to(device) call is added (moving an accelerate-dispatched
# model by hand would conflict with the automatic placement).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
)

# Text-generation pipeline for efficient inference.
# The model is already placed by device_map="auto", so no device argument is
# passed to the pipeline (it would try to move the dispatched model again).
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


async def generate_stream(query: str):
    """Generate a response with LLaMA and stream it word by word."""
    # Run the blocking generation call in a worker thread so the event loop
    # stays responsive while the model produces the full response.
    output = await asyncio.to_thread(
        generator, query, max_length=512, do_sample=True, temperature=0.7
    )
    response_text = output[0]["generated_text"]

    # Simulate streaming by yielding the response one word at a time
    for word in response_text.split():
        yield word + " "
        await asyncio.sleep(0.05)

    yield "\n"
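

# Example usage: a minimal sketch, assuming this script is run directly and the
# gated Llama-2 weights are already downloaded/authorized on the Hugging Face
# Hub. The prompt string here is purely illustrative.
async def main():
    prompt = "Explain what an event loop is in one sentence."
    async for chunk in generate_stream(prompt):
        print(chunk, end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())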