Spaces:

usag1e
/

my-llm-endpoint-fresh

Runtime error

File size: 1,138 Bytes

8e1f74f
a931f78
8e1f74f
 
32777f1
8e1f74f
 
 
2c0f83d
 
 
 
7ac3def
419e196
2c0f83d
2cb7578
 
 
8e1f74f
a931f78
c9f6dd3
 
8e1f74f

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the tokenizer and model once, at import time, so that every request
# handled by this process reuses the same objects.
MODEL_NAME = "deepseek-ai/DeepSeek-V3-Base"  # Change to the model you want

# Preferred device for request tensors: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# trust_remote_code=True runs Python code shipped inside the model repository;
# only point MODEL_NAME at repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# device_map="auto" hands weight placement to Accelerate (possibly spreading
# layers across several devices or offloading them). The loaded model must
# therefore NOT be moved again with .to(device): doing so is redundant at best
# and fails when some modules have been offloaded.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,  # Allow execution of custom code
    low_cpu_mem_usage=True,  # Ensures reduced memory usage
)

# FastAPI application instance; the /predict route below is registered on it.
app = FastAPI()

# Request body schema for the /predict endpoint.
class Query(BaseModel):
    # Prompt text; predict() tokenizes it and feeds it to the model.
    input_text: str

@app.post("/predict")
async def predict(query: Query):
    """Generate text from the submitted prompt using the loaded model.

    Args:
        query: Request body; ``query.input_text`` is the prompt.

    Returns:
        A dict ``{"response": text}`` where ``text`` is the first generated
        sequence decoded with special tokens removed (at most 50 new tokens
        are generated).

    Raises:
        HTTPException: With status 400 when ``input_text`` is empty.
    """
    input_text = query.input_text
    if not input_text:
        raise HTTPException(status_code=400, detail="Input text cannot be empty.")

    # Send the inputs to the device the model's weights actually live on; with
    # device_map="auto" that is decided at load time and may differ from a
    # separately computed global device.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # NOTE(review): generate() is a blocking call inside an async handler, so
    # the event loop is occupied for the duration of each generation.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask explicitly so generate() does not have to guess it.
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=50,
        # temperature is only honoured when sampling is enabled; without
        # do_sample=True decoding is greedy and temperature is ignored.
        do_sample=True,
        temperature=0.7,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"response": response}