Lhumpal committed on
Commit
e412a0c
·
verified ·
1 Parent(s): 5221913

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -15
app.py CHANGED
@@ -7,14 +7,6 @@ import time # Import time module for measuring response time
7
 
8
  app = FastAPI()
9
 
10
- # Get the token from the environment variable
11
- hf_token = os.environ.get("HF_TOKEN")
12
-
13
- if hf_token:
14
- client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct", token=hf_token)
15
- else:
16
- raise ValueError("HF_TOKEN environment variable not set. Please add it as a secret in your Hugging Face Space.")
17
-
18
  # model_name = 'llama3.2'
19
  model_name = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M'
20
 
@@ -29,13 +21,13 @@ class ChatRequest(BaseModel):
29
 
30
  class ChatResponse(BaseModel):
31
  model_status: str
32
- response: str
33
- response_time: float # Add field for response time
34
 
35
  @app.post("/chat", response_model=ChatResponse)
36
  async def chat(request: ChatRequest):
37
  try:
38
- start_time = time.time() # Start measuring time
39
 
40
  messages = [
41
  {'role': 'system', 'content': request.system_message},
@@ -44,13 +36,14 @@ async def chat(request: ChatRequest):
44
 
45
  # Simulate getting the response from the model
46
  response = ollama.chat(model=model_name, messages=messages)
 
 
47
  # response = "pending"
48
- response = str(response)
49
 
50
- end_time = time.time() # End measuring time
51
- response_time = end_time - start_time # Calculate the response time
52
 
53
- return {"model_status": model.status, "response": response, "response_time": response_time}
54
 
55
  except Exception as e:
56
  raise HTTPException(status_code=500, detail=str(e))
 
7
 
8
# FastAPI application instance (the FastAPI import lives at the top of the file).
app = FastAPI()

# Identifier of the Ollama model served by the /chat endpoint.
# Previously: model_name = 'llama3.2'
model_name = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M'
12
 
 
21
 
22
class ChatResponse(BaseModel):
    """Response payload for the /chat endpoint.

    Fields:
        model_status:  status string reported for the serving model.
        response:      the assistant's reply text. The endpoint extracts
                       ``response_dict["message"]["content"]`` — a plain
                       string — so this field must be ``str``; declaring it
                       ``dict`` makes pydantic response validation fail on
                       every successful reply.
        response_time: wall-clock seconds taken to produce the reply.
    """
    model_status: str
    # Fixed: was `dict`, but the handler returns the message *content* string.
    response: str
    response_time: float
 
27
  @app.post("/chat", response_model=ChatResponse)
28
  async def chat(request: ChatRequest):
29
  try:
30
+ start_time = time.time()
31
 
32
  messages = [
33
  {'role': 'system', 'content': request.system_message},
 
36
 
37
  # Simulate getting the response from the model
38
  response = ollama.chat(model=model_name, messages=messages)
39
+ response_dict = vars(response)
40
+ assistant_response = response_dict["message"]["content"]
41
  # response = "pending"
 
42
 
43
+ end_time = time.time()
44
+ response_time = end_time - start_time
45
 
46
+ return {"model_status": model.status, "response_time": response_time, "response": assistant_response}
47
 
48
  except Exception as e:
49
  raise HTTPException(status_code=500, detail=str(e))