"""Callback Handler that prints to std out.""" | |
import threading | |
from typing import Any, Dict, List | |
from langchain_core.callbacks import BaseCallbackHandler | |
from langchain_core.messages import AIMessage | |
from langchain_core.outputs import ChatGeneration, LLMResult | |
MODEL_COST_PER_1K_TOKENS = {
    # GPT-4o input
    "gpt-4o": 0.005,
    "gpt-4o-2024-05-13": 0.005,
    # GPT-4o output
    "gpt-4o-completion": 0.015,
    "gpt-4o-2024-05-13-completion": 0.015,
    # GPT-4 input
    "gpt-4": 0.03,
    "gpt-4-0314": 0.03,
    "gpt-4-0613": 0.03,
    "gpt-4-32k": 0.06,
    "gpt-4-32k-0314": 0.06,
    "gpt-4-32k-0613": 0.06,
    "gpt-4-vision-preview": 0.01,
    "gpt-4-1106-preview": 0.01,
    "gpt-4-0125-preview": 0.01,
    "gpt-4-turbo-preview": 0.01,
    "gpt-4-turbo": 0.01,
    "gpt-4-turbo-2024-04-09": 0.01,
    # GPT-4 output
    "gpt-4-completion": 0.06,
    "gpt-4-0314-completion": 0.06,
    "gpt-4-0613-completion": 0.06,
    "gpt-4-32k-completion": 0.12,
    "gpt-4-32k-0314-completion": 0.12,
    "gpt-4-32k-0613-completion": 0.12,
    "gpt-4-vision-preview-completion": 0.03,
    "gpt-4-1106-preview-completion": 0.03,
    "gpt-4-0125-preview-completion": 0.03,
    "gpt-4-turbo-preview-completion": 0.03,
    "gpt-4-turbo-completion": 0.03,
    "gpt-4-turbo-2024-04-09-completion": 0.03,
    # GPT-3.5 input
    # gpt-3.5-turbo points at gpt-3.5-turbo-0613 until Feb 16, 2024.
    # Switches to gpt-3.5-turbo-0125 after.
    "gpt-3.5-turbo": 0.0015,
    "gpt-3.5-turbo-0125": 0.0005,
    "gpt-3.5-turbo-0301": 0.0015,
    "gpt-3.5-turbo-0613": 0.0015,
    "gpt-3.5-turbo-1106": 0.001,
    "gpt-3.5-turbo-instruct": 0.0015,
    "gpt-3.5-turbo-16k": 0.003,
    "gpt-3.5-turbo-16k-0613": 0.003,
    # GPT-3.5 output
    # gpt-3.5-turbo points at gpt-3.5-turbo-0613 until Feb 16, 2024.
    # Switches to gpt-3.5-turbo-0125 after.
    "gpt-3.5-turbo-completion": 0.002,
    "gpt-3.5-turbo-0125-completion": 0.0015,
    "gpt-3.5-turbo-0301-completion": 0.002,
    "gpt-3.5-turbo-0613-completion": 0.002,
    "gpt-3.5-turbo-1106-completion": 0.002,
    "gpt-3.5-turbo-instruct-completion": 0.002,
    "gpt-3.5-turbo-16k-completion": 0.004,
    "gpt-3.5-turbo-16k-0613-completion": 0.004,
    # Azure GPT-35 input
    "gpt-35-turbo": 0.0015,  # Azure OpenAI version of ChatGPT
    "gpt-35-turbo-0301": 0.0015,  # Azure OpenAI version of ChatGPT
    "gpt-35-turbo-0613": 0.0015,
    "gpt-35-turbo-instruct": 0.0015,
    "gpt-35-turbo-16k": 0.003,
    "gpt-35-turbo-16k-0613": 0.003,
    # Azure GPT-35 output
    "gpt-35-turbo-completion": 0.002,  # Azure OpenAI version of ChatGPT
    "gpt-35-turbo-0301-completion": 0.002,  # Azure OpenAI version of ChatGPT
    "gpt-35-turbo-0613-completion": 0.002,
    "gpt-35-turbo-instruct-completion": 0.002,
    "gpt-35-turbo-16k-completion": 0.004,
    "gpt-35-turbo-16k-0613-completion": 0.004,
    # Others
    "text-ada-001": 0.0004,
    "ada": 0.0004,
    "text-babbage-001": 0.0005,
    "babbage": 0.0005,
    "text-curie-001": 0.002,
    "curie": 0.002,
    "text-davinci-003": 0.02,
    "text-davinci-002": 0.02,
    "code-davinci-002": 0.02,
    # Fine Tuned input
    "babbage-002-finetuned": 0.0016,
    "davinci-002-finetuned": 0.012,
    "gpt-3.5-turbo-0613-finetuned": 0.003,
    "gpt-3.5-turbo-1106-finetuned": 0.003,
    "gpt-3.5-turbo-0125-finetuned": 0.003,
    # Fine Tuned output
    "babbage-002-finetuned-completion": 0.0016,
    "davinci-002-finetuned-completion": 0.012,
    "gpt-3.5-turbo-0613-finetuned-completion": 0.006,
    "gpt-3.5-turbo-1106-finetuned-completion": 0.006,
    "gpt-3.5-turbo-0125-finetuned-completion": 0.006,
    # Azure Fine Tuned input
    "babbage-002-azure-finetuned": 0.0004,
    "davinci-002-azure-finetuned": 0.002,
    "gpt-35-turbo-0613-azure-finetuned": 0.0015,
    # Azure Fine Tuned output
    "babbage-002-azure-finetuned-completion": 0.0004,
    "davinci-002-azure-finetuned-completion": 0.002,
    "gpt-35-turbo-0613-azure-finetuned-completion": 0.002,
    # Legacy fine-tuned models
    "ada-finetuned-legacy": 0.0016,
    "babbage-finetuned-legacy": 0.0024,
    "curie-finetuned-legacy": 0.012,
    "davinci-finetuned-legacy": 0.12,
}

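# Worked example (illustrative, derived from the table above): values are USD
# per 1,000 tokens, with bare model keys pricing input tokens and "-completion"
# keys pricing output tokens. So a 1,500-token prompt to "gpt-4o" costs
# 1.5 * 0.005 = $0.0075, and a 500-token "gpt-4o" completion costs
# 0.5 * 0.015 = $0.0075.
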
def standardize_model_name(
    model_name: str,
    is_completion: bool = False,
) -> str:
    """Standardize the model name to a format that can be used in the OpenAI API.

    Args:
        model_name: Model name to standardize.
        is_completion: Whether the model is used for completion or not.
            Defaults to False.

    Returns:
        Standardized model name.
    """
    model_name = model_name.lower()
    if ".ft-" in model_name:
        model_name = model_name.split(".ft-")[0] + "-azure-finetuned"
    if ":ft-" in model_name:
        model_name = model_name.split(":")[0] + "-finetuned-legacy"
    if "ft:" in model_name:
        model_name = model_name.split(":")[1] + "-finetuned"
    if is_completion and (
        model_name.startswith("gpt-4")
        or model_name.startswith("gpt-3.5")
        or model_name.startswith("gpt-35")
        or ("finetuned" in model_name and "legacy" not in model_name)
    ):
        return model_name + "-completion"
    else:
        return model_name

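# Illustrative behavior of the rules above (the fine-tuned model id is a
# made-up example, not a real deployment):
#   standardize_model_name("GPT-4", is_completion=True)
#       -> "gpt-4-completion"
#   standardize_model_name("ft:gpt-3.5-turbo-0613:my-org::abc123")
#       -> "gpt-3.5-turbo-0613-finetuned"

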
def get_openai_token_cost_for_model(
    model_name: str, num_tokens: int, is_completion: bool = False
) -> float:
    """Get the cost in USD for a given model and number of tokens.

    Args:
        model_name: Name of the model.
        num_tokens: Number of tokens.
        is_completion: Whether the model is used for completion or not.
            Defaults to False.

    Returns:
        Cost in USD.
    """
    model_name = standardize_model_name(model_name, is_completion=is_completion)
    if model_name not in MODEL_COST_PER_1K_TOKENS:
        raise ValueError(
            f"Unknown model: {model_name}. Please provide a valid OpenAI model name. "
            "Known models are: " + ", ".join(MODEL_COST_PER_1K_TOKENS.keys())
        )
    return MODEL_COST_PER_1K_TOKENS[model_name] * (num_tokens / 1000)

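# Worked example (illustrative): for "gpt-4o" at $0.005 per 1K prompt tokens,
# get_openai_token_cost_for_model("gpt-4o", 2000) returns 2 * 0.005 = 0.01,
# while the same call with is_completion=True resolves to the
# "gpt-4o-completion" rate of $0.015 and returns 0.03.

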
class OpenAICallbackHandler(BaseCallbackHandler):
    """Callback Handler that tracks OpenAI info."""

    total_tokens: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0
    successful_requests: int = 0
    total_cost: float = 0.0

    def __init__(self) -> None:
        super().__init__()
        self._lock = threading.Lock()

    def __repr__(self) -> str:
        return (
            f"Tokens Used: {self.total_tokens}\n"
            f"\tPrompt Tokens: {self.prompt_tokens}\n"
            f"\tCompletion Tokens: {self.completion_tokens}\n"
            f"Successful Requests: {self.successful_requests}\n"
            f"Total Cost (USD): ${self.total_cost}"
        )

    @property
    def always_verbose(self) -> bool:
        """Whether to call verbose callbacks even if verbose is False."""
        return True

    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Do nothing when the LLM starts; this handler only tracks usage."""
        pass

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Do nothing on new tokens; usage is collected in on_llm_end."""
        pass

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Collect token usage."""
        # Check for usage_metadata (langchain-core >= 0.2.2)
        try:
            generation = response.generations[0][0]
        except IndexError:
            generation = None
        if isinstance(generation, ChatGeneration):
            try:
                message = generation.message
                if isinstance(message, AIMessage):
                    usage_metadata = message.usage_metadata
                else:
                    usage_metadata = None
            except AttributeError:
                usage_metadata = None
        else:
            usage_metadata = None
        if usage_metadata:
            token_usage = {"total_tokens": usage_metadata["total_tokens"]}
            completion_tokens = usage_metadata["output_tokens"]
            prompt_tokens = usage_metadata["input_tokens"]
            if response.llm_output is None:
                # model name (and therefore cost) is unavailable in
                # streaming responses
                model_name = ""
            else:
                model_name = standardize_model_name(
                    response.llm_output.get("model_name", "")
                )
        else:
            if response.llm_output is None:
                return None
            if "token_usage" not in response.llm_output:
                with self._lock:
                    self.successful_requests += 1
                return None
            # compute tokens and cost for this request
            token_usage = response.llm_output["token_usage"]
            completion_tokens = token_usage.get("completion_tokens", 0)
            prompt_tokens = token_usage.get("prompt_tokens", 0)
            model_name = standardize_model_name(
                response.llm_output.get("model_name", "")
            )
        if model_name in MODEL_COST_PER_1K_TOKENS:
            completion_cost = get_openai_token_cost_for_model(
                model_name, completion_tokens, is_completion=True
            )
            prompt_cost = get_openai_token_cost_for_model(model_name, prompt_tokens)
        else:
            completion_cost = 0
            prompt_cost = 0

        # update shared state behind lock
        with self._lock:
            self.total_cost += prompt_cost + completion_cost
            self.total_tokens += token_usage.get("total_tokens", 0)
            self.prompt_tokens += prompt_tokens
            self.completion_tokens += completion_tokens
            self.successful_requests += 1

def __copy__(self) -> "OpenAICallbackHandler": | |
"""Return a copy of the callback handler.""" | |
return self | |
def __deepcopy__(self, memo: Any) -> "OpenAICallbackHandler": | |
"""Return a deep copy of the callback handler.""" | |
return self | |