"""FastAPI app exposing chat and inference endpoints backed by Hugging Face
Inference API models (Meta-Llama-3 and Qwen) composed through LangChain."""
from fastapi import FastAPI
import os
from typing import Union

# Only referenced by the commented-out CustomLLM chain further below.
from custom_llm import CustomLLM

from pydantic import BaseModel
from langchain.prompts import PromptTemplate
# HuggingFacePipeline is only referenced by the commented-out local-pipeline setup below.
from langchain_huggingface import HuggingFacePipeline, HuggingFaceEndpoint


class ConversationPost(BaseModel):
    """Request body for the /conversation* endpoints."""
    tenant: Union[str, None] = None
    module: Union[str, None] = None
    question: str


class InferencePost(BaseModel):
    """Request body for /inference; with_template picks which prompt template to apply."""
    question: str
    with_template: Union[str, None] = None


class LLMPost(BaseModel):
    """Request body for /chat; model picks which backing endpoint to call."""
    model: str
    question: str

# The Hugging Face token is read from HF_API_KEY and re-exported as
# HUGGINGFACEHUB_API_TOKEN, which is the variable HuggingFaceEndpoint looks for.
API_TOKEN = os.environ['HF_API_KEY']

os.environ["HUGGINGFACEHUB_API_TOKEN"] = API_TOKEN

app = FastAPI()

# Chat prompt templates. The Indonesian system prompt translates to:
# "You are an AI Assistant developed by Jonthan Jordan. Answer strictly in Bahasa Indonesia."
# prompt_qwen uses Qwen's ChatML format; prompt_llama uses the Llama 3 instruct format.
prompt_qwen = PromptTemplate.from_template("""<|im_start|>system
Kamu adalah Asisten AI yang dikembangkan oleh Jonthan Jordan. Answer strictly in Bahasa Indonesia<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
""")

prompt_llama = PromptTemplate.from_template("""<|start_header_id|>system<|end_header_id|>

Kamu adalah Asisten AI yang dikembangkan oleh Jonthan Jordan. Answer strictly in Bahasa Indonesia<|eot_id|><|start_header_id|>user<|end_header_id|>

{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""")
# Alternative: run Qwen2 locally through a transformers pipeline instead of the Inference API.
# llm = prompt_qwen | HuggingFacePipeline.from_model_id(
#     model_id="Qwen/Qwen2-1.5B-Instruct",
#     task="text-generation",
#     pipeline_kwargs={
#         "max_new_tokens": 150,
#         "return_full_text": False,
#     },
# )

# Remote text-generation endpoints on the Hugging Face Inference API.
# do_sample=False requests deterministic (greedy) decoding.
llama = HuggingFaceEndpoint(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    task="text-generation",
    max_new_tokens=4096,
    do_sample=False,
)

qwen = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen1.5-4B-Chat",
    task="text-generation",
    max_new_tokens=4096,
    do_sample=False,
)

qwen2 = HuggingFaceEndpoint(
    repo_id="Qwen/Qwen2-1.5B-Instruct",
    task="text-generation",
    max_new_tokens=4096,
    do_sample=False,
)

# Prompt | endpoint chains used by the /conversation* routes.
llm = prompt_qwen | qwen     # Qwen1.5-4B-Chat, ChatML template
llm2 = prompt_llama | llama  # Meta-Llama-3-8B-Instruct, Llama 3 template
llm3 = prompt_qwen | qwen2   # Qwen2-1.5B-Instruct, ChatML template

# Alternative: a custom wrapper around Qwen-VL-Chat, stopped at the ChatML end token.
# llm = prompt_qwen | CustomLLM(repo_id="Qwen/Qwen-VL-Chat", model_type='text-generation', api_token=API_TOKEN, max_new_tokens=150).bind(stop=['<|im_end|>'])


@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.post("/chat")
async def chat(data: LLMPost):
    """Raw completion against the selected endpoint, without any prompt template."""
    if data.model == 'llama':
        return {"data": llama.invoke(data.question)}
    elif data.model == 'qwen':
        return {"data": qwen.invoke(data.question)}
    else:
        return {"data": qwen2.invoke(data.question)}


@app.post("/conversation")
async def conversation(data: ConversationPost):
    """Chat with Qwen1.5-4B-Chat through the ChatML prompt template."""
    return {"output": llm.invoke({"question": data.question})}


@app.post("/conversation2")
async def conversation2(data: ConversationPost):
    """Chat with Meta-Llama-3-8B-Instruct through the Llama 3 prompt template."""
    return {"output": llm2.invoke({"question": data.question})}


@app.post("/conversation3")
async def conversation3(data: ConversationPost):
    """Chat with Qwen2-1.5B-Instruct through the ChatML prompt template."""
    return {"output": llm3.invoke({"question": data.question})}


@app.post("/inference")
async def inference(data: InferencePost):
    """Completion with an optional prompt template chosen by with_template;
    falls back to the raw Llama endpoint when no template is given."""
    if data.with_template == 'llama':
        out = llm2.invoke(data.question)
    elif data.with_template == 'qwen':
        out = llm.invoke(data.question)
    elif data.with_template == 'qwen2':
        out = llm3.invoke(data.question)
    else:
        out = llama.invoke(data.question)

    return {"output": out}