abhijit2k01 committed on
Commit 4f4c02c · verified · 1 Parent(s): 04451d9

updated with a smaller model.

Files changed (1)
  1. app/main.py +10 -6
app/main.py CHANGED
@@ -1,11 +1,14 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from vllm import LLM, SamplingParams
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
 
 app = FastAPI()
 
 # Initialize the model (we'll use a small model for this example)
-model = LLM(model="EleutherAI/gpt-neo-125M")
+model_name = "EleutherAI/gpt-neo-125M"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
 
 class GenerateRequest(BaseModel):
     prompt: str
@@ -13,12 +16,13 @@ class GenerateRequest(BaseModel):
 @app.post("/generate")
 async def generate(request: GenerateRequest):
     try:
-        sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
-        outputs = model.generate([request.prompt], sampling_params)
-        return {"generated_text": outputs[0].outputs[0].text}
+        input_ids = tokenizer.encode(request.prompt, return_tensors="pt")
+        output = model.generate(input_ids, max_length=100, num_return_sequences=1)
+        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+        return {"generated_text": generated_text}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/")
 async def root():
-    return {"message": "vLLM server is running"}
+    return {"message": "Model server is running"}
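For a quick sanity check of the transformers-backed endpoint introduced here, the snippet below exercises the app in-process with FastAPI's TestClient. This is a minimal sketch, assuming the file lives at app/main.py as shown and that the example prompt is arbitrary; the first call will download EleutherAI/gpt-neo-125M from the Hugging Face Hub, since the model is loaded at import time.

from fastapi.testclient import TestClient

from app.main import app  # importing app/main.py also loads the tokenizer and model

client = TestClient(app)

# POST a prompt to the /generate endpoint and print the completion.
resp = client.post("/generate", json={"prompt": "Once upon a time"})
resp.raise_for_status()
print(resp.json()["generated_text"])

One thing to keep in mind with the new generate() call: max_length counts the prompt tokens as well, so long prompts leave less room for output; max_new_tokens would cap only the generated tokens if that behavior is preferred.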