from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()

# Load the model and tokenizer. GLM checkpoints ship custom modeling code,
# so trust_remote_code=True is needed on both calls; device_map="auto"
# places the weights on the GPU when one is available, otherwise on the CPU.
model_name = "nikravan/glm-4vq"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

class Query(BaseModel):
    question: str

@app.post("/predict")
def predict(data: Query):
    # Tokenize the question and move the tensors to wherever the model was placed
    inputs = tokenizer(data.question, return_tensors="pt").to(model.device)
    # Bound the number of generated tokens (max_length would count the prompt too)
    outputs = model.generate(**inputs, max_new_tokens=200)
    # Return only the newly generated tokens, with special tokens stripped
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return {"answer": tokenizer.decode(new_tokens, skip_special_tokens=True)}