from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

app = FastAPI()

# Define paths
base_model_path = "NousResearch/Hermes-3-Llama-3.2-3B"
adapter_path = "zach9111/llama_startup_adapter"

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load base model with `device_map="auto"` to handle GPUs automatically
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load adapter and ensure it is on the correct device
model = PeftModel.from_pretrained(base_model, adapter_path).to(device)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_path)


class GenerateRequest(BaseModel):
    prompt: str


# **Use `model.generate()` instead of `pipeline()`**
def generate_text_from_model(prompt: str):
    try:
        # Tokenize the prompt and move the tensors to the same device as the model
        input_ids = tokenizer(f"[INST] {prompt} [/INST]", return_tensors="pt").input_ids.to(device)
        output_ids = model.generate(input_ids, max_length=512)
        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return generated_text
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# Root endpoint for testing
@app.get("/")
async def root():
    return {"message": "Model is running! Use /generate/ for text generation."}


# Text generation endpoint
@app.post("/generate/")
async def generate_text(request: GenerateRequest):
    response = generate_text_from_model(request.prompt)
    return {"response": response}
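

# --- Usage sketch (not part of the original snippet) ---
# Assumptions: the file is saved as main.py and port 8000 is free.
# Start the server with `uvicorn main:app --host 0.0.0.0 --port 8000`,
# or run this file directly via the guard below.
# A sample request matching the GenerateRequest schema above:
#   curl -X POST http://localhost:8000/generate/ \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Describe your product in one sentence."}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)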