from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from vllm import LLM, SamplingParams

app = FastAPI()

# Initialize the model (we'll use a small model for this example).
model = LLM(model="EleutherAI/gpt-neo-125M")


class GenerateRequest(BaseModel):
    prompt: str


@app.post("/generate")
def generate(request: GenerateRequest):
    # vLLM's generate() is a blocking call, so this is a plain `def` endpoint:
    # FastAPI runs it in a worker thread rather than blocking the event loop.
    try:
        sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
        outputs = model.generate([request.prompt], sampling_params)
        # generate() returns one RequestOutput per prompt; each holds a list
        # of completions, and we return the text of the first one.
        return {"generated_text": outputs[0].outputs[0].text}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
async def root():
    return {"message": "vLLM server is running"}
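# --- Usage sketch (not part of the server above) ---
# A minimal client, assuming the server code is saved as main.py (a
# hypothetical filename) and started with `uvicorn main:app --port 8000`.
# The /generate path and the {"prompt": ...} / {"generated_text": ...}
# shapes come from the server definition above.
import requests

response = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Once upon a time"},
    timeout=60,
)
response.raise_for_status()
print(response.json()["generated_text"])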