import os
from typing import List, Optional, Union

from litellm import completion

from lagent.llms import (GPTAPI, INTERNLM2_META, BaseAPIModel,
                         HFTransformerCasualLM, LMDeployClient,
                         LMDeployServer)
from lagent.schema import ModelStatusCode
from lagent.utils.util import filter_suffix

internlm_server = dict(type=LMDeployServer,
                       path='internlm/internlm2_5-7b-chat',
                       model_name='internlm2',
                       meta_template=INTERNLM2_META,
                       top_p=0.8,
                       top_k=1,
                       temperature=0,
                       max_new_tokens=8192,
                       repetition_penalty=1.02,
                       stop_words=['<|im_end|>'])

internlm_client = dict(type=LMDeployClient,
                       model_name='internlm2_5-7b-chat',
                       url='http://127.0.0.1:23333',
                       meta_template=INTERNLM2_META,
                       top_p=0.8,
                       top_k=1,
                       temperature=0,
                       max_new_tokens=8192,
                       repetition_penalty=1.02,
                       stop_words=['<|im_end|>'])

internlm_hf = dict(type=HFTransformerCasualLM,
                   path='internlm/internlm2_5-7b-chat',
                   meta_template=INTERNLM2_META,
                   top_p=0.8,
                   top_k=None,
                   temperature=1e-6,
                   max_new_tokens=8192,
                   repetition_penalty=1.02,
                   stop_words=['<|im_end|>'])

# openai_api_base must be the full chat completions endpoint,
# e.g. https://api.openai.com/v1/chat/completions
gpt4 = dict(type=GPTAPI,
            model_type='gpt-4-turbo',
            key=os.environ.get('OPENAI_API_KEY', 'YOUR OPENAI API KEY'),
            openai_api_base=os.environ.get(
                'OPENAI_API_BASE',
                'https://api.openai.com/v1/chat/completions'),
            )

url = 'https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation'
qwen = dict(type=GPTAPI,
            model_type='qwen-max-longcontext',
            key=os.environ.get('QWEN_API_KEY', 'YOUR QWEN API KEY'),
            openai_api_base=url,
            meta_template=[
                dict(role='system', api_role='system'),
                dict(role='user', api_role='user'),
                dict(role='assistant', api_role='assistant'),
                dict(role='environment', api_role='system')
            ],
            top_p=0.8,
            top_k=1,
            temperature=0,
            max_new_tokens=4096,
            repetition_penalty=1.02,
            stop_words=['<|im_end|>'])

internlm_silicon = dict(type=GPTAPI,
                        model_type='internlm/internlm2_5-7b-chat',
                        key=os.environ.get('SILICON_API_KEY',
                                           'YOUR SILICON API KEY'),
                        openai_api_base='https://api.siliconflow.cn/v1/chat/completions',
                        meta_template=[
                            dict(role='system', api_role='system'),
                            dict(role='user', api_role='user'),
                            dict(role='assistant', api_role='assistant'),
                            dict(role='environment', api_role='system')
                        ],
                        top_p=0.8,
                        top_k=1,
                        temperature=0,
                        max_new_tokens=8192,
                        repetition_penalty=1.02,
                        stop_words=['<|im_end|>'])
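
# All of the dicts above follow lagent's config convention: ``type`` holds the
# model class and every other key is a constructor argument. A minimal,
# illustrative sketch of materialising one of them by hand (the surrounding
# project may use its own factory instead, and the call assumes the matching
# backend server or API key is actually reachable):
#
#     cfg = dict(internlm_client)
#     model_cls = cfg.pop('type')
#     llm = model_cls(**cfg)
#     print(llm.generate('Hello'))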


class litellmCompletion(BaseAPIModel):
    """Chat-completion wrapper that routes requests through LiteLLM.

    Args:
        path (str): placeholder kept for interface compatibility with the
            other lagent model wrappers; it is passed through to
            ``BaseAPIModel`` and is not used by LiteLLM itself.
        model_name (str): model identifier understood by LiteLLM, e.g.
            "command-r" or "deepseek/deepseek-chat". Defaults to
            "command-r".
        **kwargs: generation and API parameters accepted by
            ``BaseAPIModel`` (e.g. ``meta_template``, ``max_new_tokens``,
            ``temperature``, ``stop_words``).
    """

    def __init__(self, path='', model_name='command-r', **kwargs):
        self.model_name = model_name
        super().__init__(path, **kwargs)

    def generate(self,
                 inputs: Union[str, List[str]],
                 do_preprocess: bool = None,
                 skip_special_tokens: bool = False,
                 **kwargs):
        """Return the chat completions in non-stream mode.

        Args:
            inputs (Union[str, List[str]]): input texts to be completed.
            do_preprocess (bool): whether to pre-process the messages.
                Defaults to True, which means chat_template will be applied.
            skip_special_tokens (bool): Whether or not to remove special
                tokens in the decoding. Defaults to False.

        Returns:
            (a list of/batched) text/chat completion
        """
        batched = True
        if isinstance(inputs, str):
            inputs = [inputs]
            batched = False
        gen_params = self.update_gen_params(**kwargs)
        # Send each prompt as its own single-turn conversation so that one
        # completion is returned per input.
        responses = []
        for prompt in inputs:
            messages = [{'role': 'user', 'content': prompt}]
            resp = completion(model=self.model_name, messages=messages)
            responses.append(resp.choices[0].message.content)
        # remove stop_words
        responses = filter_suffix(responses, gen_params.get('stop_words'))
        if batched:
            return responses
        return responses[0]

    def stream_chat(self,
                    inputs: List[dict],
                    stream: bool = True,
                    ignore_eos: bool = False,
                    skip_special_tokens: Optional[bool] = False,
                    timeout: int = 30,
                    **kwargs):
        """Start a new round of conversation and return the chat
        completions in stream mode.

        Args:
            inputs (List[dict]): user's inputs in this round conversation
            stream (bool): return in a streaming format if enabled
            ignore_eos (bool): indicator for ignoring eos
            skip_special_tokens (bool): Whether or not to remove special
                tokens in the decoding. Defaults to False.
            timeout (int): max time to wait for response

        Returns:
            tuple(Status, str, int): status, text/chat completion,
                generated token number
        """
        gen_params = self.update_gen_params(**kwargs)
        # LiteLLM follows the OpenAI naming, so translate max_new_tokens
        # to max_tokens before forwarding the generation parameters.
        max_new_tokens = gen_params.pop('max_new_tokens')
        gen_params.update(max_tokens=max_new_tokens)
        # stop_words are stripped locally below, so do not forward them.
        stop_words = gen_params.pop('stop_words', None) or []
        resp = ''
        finished = False
        messages = self.template_parser._prompt2api(inputs)
        for chunk in completion(model=self.model_name,
                                messages=messages,
                                stream=stream,
                                **gen_params):
            if not chunk.choices[0].delta.content:
                continue
            resp += chunk.choices[0].delta.content
            if not resp:
                continue
            # remove stop_words
            for sw in stop_words:
                if sw in resp:
                    resp = filter_suffix(resp, stop_words)
                    finished = True
                    break
            yield ModelStatusCode.STREAM_ING, resp, None
            if finished:
                break
        yield ModelStatusCode.END, resp, None


litellm_completion = dict(
    type=litellmCompletion,
    # model_name="deepseek/deepseek-chat",
    meta_template=[
        dict(role='system', api_role='system'),
        dict(role='user', api_role='user'),
        dict(role='assistant', api_role='assistant'),
        dict(role='environment', api_role='system')
    ])
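

# A minimal usage sketch, not part of the module's public surface: it assumes
# the credentials LiteLLM needs for the chosen backend are already set in the
# environment, e.g. COHERE_API_KEY for the default "command-r" model.
if __name__ == '__main__':
    cfg = dict(litellm_completion)
    llm = cfg.pop('type')(**cfg)
    # Single string in, single completion string out.
    print(llm.generate('Reply with one short greeting.'))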