Kaan committed on
Commit c074c98 · verified · 1 Parent(s): e5c27c2
Files changed (1)
  1. app.py +11 -12
app.py CHANGED
@@ -1,5 +1,6 @@
 from fastapi import FastAPI
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from llama_cpp import Llama
 
 
 # Create an instance of the FastAPI class
@@ -8,18 +9,16 @@ app = FastAPI()
 # Define a route for the root endpoint
 @app.get("/llm")
 async def read_root():
-    device = "cpu"
-    model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
-    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
-    text = """<s>[INST] What is your favourite condiment? [/INST]
-    """
-    encodeds = tokenizer(text, return_tensors="pt", add_special_tokens=False)
-    model_inputs = encodeds.to(device)
-    model.to(device)
-    generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True)
-    decoded = tokenizer.batch_decode(generated_ids)
-    print(decoded[0])
-    return {"message": decoded[0]}
+    llm = Llama.from_pretrained(
+        repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
+        filename="*q8_0.gguf",
+        verbose=False)
+    output = llm(
+        "Q: Name the planets in the solar system? A: ",  # Prompt
+        max_tokens=32,  # Generate up to 32 tokens; set to None to generate up to the end of the context window
+        stop=["Q:", "\n"],  # Stop generating just before the model would generate a new question
+        echo=True)  # Echo the prompt back in the output
+    return {"message": output}
 
 
 
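For reference, a minimal way to exercise the updated /llm route locally is sketched below. It assumes the app is served with uvicorn and that fastapi, llama-cpp-python, huggingface_hub, and requests are installed; the host, port, and client code are illustrative assumptions, not part of the commit.

# Minimal client sketch (assumed setup, not part of the commit).
# Start the server first, e.g.:  uvicorn app:app --host 127.0.0.1 --port 8000
import requests

# The first request may be slow: Llama.from_pretrained downloads the GGUF file
# from the Hugging Face Hub before generating.
resp = requests.get("http://127.0.0.1:8000/llm", timeout=600)
resp.raise_for_status()

completion = resp.json()["message"]      # raw llama-cpp-python completion dict
print(completion["choices"][0]["text"])  # generated text (includes the prompt, since echo=True)

The switch from loading mistralai/Mistral-7B-Instruct-v0.1 with transformers to a quantized Qwen1.5-0.5B-Chat GGUF via llama_cpp keeps the endpoint CPU-friendly and avoids loading a full-precision 7B model on each request.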