Spaces:

abhijit2k01
/

vllm-benchmark

Sleeping

vllm-benchmark / app /main.py

GPT-Neo 125M created in app/main.py

4c931c6 verified 9 months ago

742 Bytes

	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel
	from vllm import LLM, SamplingParams

	# ASGI application object; the route decorators in this module register on it.
	app = FastAPI()

	# Model identifier handed to vLLM (a small model, chosen for this example).
	_MODEL_ID = "EleutherAI/gpt-neo-125M"

	# Build the engine once at import time so every request reuses the same instance.
	model = LLM(model=_MODEL_ID)

	class GenerateRequest(BaseModel):
	    """Request body accepted by the ``/generate`` endpoint."""

	    # Prompt text that is passed to the model.
	    prompt: str

	@app.post("/generate")
	async def generate(request: GenerateRequest):
	    """Generate text for ``request.prompt`` with the module-level model.

	    Sampling is fixed at ``temperature=0.7`` and ``max_tokens=100``.

	    Returns:
	        A dict with the single key ``"generated_text"`` holding the text of
	        the first completion returned for the prompt.

	    Raises:
	        HTTPException: status 500, with the underlying error message as
	            ``detail``, if generation or result extraction fails.
	    """
	    # NOTE(review): model.generate is called without await inside a
	    # coroutine, so the event loop is blocked while it runs. Confirm this
	    # is acceptable for the expected request concurrency.
	    sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
	    try:
	        outputs = model.generate([request.prompt], sampling_params)
	        return {"generated_text": outputs[0].outputs[0].text}
	    except Exception as e:
	        # Chain the original error so it is preserved as __cause__.
	        raise HTTPException(status_code=500, detail=str(e)) from e

	@app.get("/")
	async def root():
	    """Return a fixed status message showing the server is up."""
	    status = {"message": "vLLM server is running"}
	    return status