from __future__ import annotations

from typing import List, Optional

from langchain import hub
from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
from langchain.callbacks.tracers.schemas import Run
from langchain.schema import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    StrOutputParser,
    get_buffer_string,
)
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import Runnable
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example
###############################################################################
# | Chat Bot Evaluator Definition
# | This section defines an evaluator that evaluates any chat bot
# | without explicit user feedback. It formats the dialog up to
# | the current message and then instructs an LLM to grade the last AI response
# | based on the subsequent user response. If no chat history is present,
# V the evaluator is not called.
###############################################################################


class ResponseEffectiveness(BaseModel):
    """Score the effectiveness of the AI chat bot response."""

    reasoning: str = Field(
        ...,
        description="Explanation for the score.",
    )
    score: int = Field(
        ...,
        min=0,
        max=5,
        description="Effectiveness of AI's final response.",
    )
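
# Illustrative only (not in the original template): once this schema is bound as
# an OpenAI function below, the parsed arguments come back as a dict such as
#     {"reasoning": "The user thanked the bot, so the reply landed well.", "score": 4}
# which normalize_score() later rescales to 0.8.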


def format_messages(input: dict) -> List[BaseMessage]:
    """Format the messages for the evaluator."""
    chat_history = input.get("chat_history") or []
    results = []
    for message in chat_history:
        if message["type"] == "human":
            results.append(HumanMessage.parse_obj(message))
        else:
            results.append(AIMessage.parse_obj(message))
    return results
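
# Illustrative only: the client sends chat history as serialized message dicts,
# e.g. [{"type": "human", "content": "hi"}, {"type": "ai", "content": "Ahoy!"}],
# which parse_obj() turns back into HumanMessage / AIMessage objects.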


def format_dialog(input: dict) -> dict:
    """Format messages and convert to a single string."""
    chat_history = format_messages(input)
    formatted_dialog = get_buffer_string(chat_history) + f"\nhuman: {input['text']}"
    return {"dialog": formatted_dialog}


def normalize_score(response: dict) -> dict:
    """Normalize the score to be between 0 and 1."""
    response["score"] = int(response["score"]) / 5
    return response


# To view the prompt in the playground: https://smith.langchain.com/hub/wfh/response-effectiveness
evaluation_prompt = hub.pull("wfh/response-effectiveness")
evaluate_response_effectiveness = (
    format_dialog
    | evaluation_prompt
    # bind_functions formats the schema for the OpenAI function
    # calling endpoint, which returns more reliable structured data.
    | ChatOpenAI(model="gpt-3.5-turbo").bind_functions(
        functions=[ResponseEffectiveness],
        function_call="ResponseEffectiveness",
    )
    # Convert the model's output to a dict
    | JsonOutputFunctionsParser(args_only=True)
    | normalize_score
)
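
# A minimal local smoke test for the evaluator chain (illustrative, not part of
# the served app; it needs OPENAI_API_KEY and access to the LangSmith Hub prompt):
#
#     evaluate_response_effectiveness.invoke(
#         {
#             "chat_history": [
#                 {"type": "human", "content": "hi"},
#                 {"type": "ai", "content": "Ahoy, matey!"},
#             ],
#             "text": "thanks!",
#         }
#     )
#
# The result is a dict like {"reasoning": "...", "score": 0.8}.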


class ResponseEffectivenessEvaluator(RunEvaluator):
    """Evaluate the chat bot based on the subsequent user responses."""

    def __init__(self, evaluator_runnable: Runnable) -> None:
        super().__init__()
        self.runnable = evaluator_runnable

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        # This evaluator grades the AI's PREVIOUS response.
        # If no chat history is present, there isn't anything to evaluate
        # (it's the user's first message).
        if not run.inputs.get("chat_history"):
            return EvaluationResult(
                key="response_effectiveness", comment="No chat history present."
            )
        # This only occurs if the client isn't correctly sending the run IDs
        # of the previous calls.
        elif "last_run_id" not in run.inputs:
            return EvaluationResult(
                key="response_effectiveness", comment="No last run ID present."
            )
        # Call the LLM to evaluate the response.
        eval_grade: Optional[dict] = self.runnable.invoke(run.inputs)
        target_run_id = run.inputs["last_run_id"]
        return EvaluationResult(
            **eval_grade,
            key="response_effectiveness",
            target_run_id=target_run_id,  # Requires langsmith >= 0.0.54
        )
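
# Illustrative only: when this listener fires, run.inputs carries the same payload
# the client sent, e.g.
#     {"chat_history": [...], "text": "thanks!", "last_run_id": "<uuid of the prior run>"}
# and the resulting feedback (key="response_effectiveness", score in [0, 1]) is
# attached to that *previous* run via target_run_id, not to the current one.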


###############################################################################
# | The chat bot definition
# | This is what is actually exposed by LangServe in the API.
# | It can be any chain that accepts the ChainInput schema and returns a str;
# | all that is required is the with_config() call at the end to add the
# V evaluators as "listeners" to the chain.
###############################################################################


class ChainInput(BaseModel):
    """Input for the chat bot."""

    chat_history: Optional[List[BaseMessage]] = Field(
        description="Previous chat messages."
    )
    text: str = Field(..., description="User's latest query.")
    last_run_id: Optional[str] = Field("", description="Run ID of the last run.")
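
# Illustrative request body for the LangServe /invoke endpoint (field names come
# from ChainInput; the exact route depends on how the app mounts this chain):
#     {
#         "input": {
#             "chat_history": [
#                 {"type": "human", "content": "hi"},
#                 {"type": "ai", "content": "Ahoy, matey!"}
#             ],
#             "text": "thanks!",
#             "last_run_id": "<run id returned for the previous turn>"
#         }
#     }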


_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant who speaks like a pirate",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{text}"),
    ]
)
_model = ChatOpenAI()


def format_chat_history(chain_input: dict) -> dict:
    """Convert the raw chain input into the variables the prompt expects."""
    messages = format_messages(chain_input)

    return {
        "chat_history": messages,
        "text": chain_input.get("text"),
    }
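
# Illustrative only: the dict returned above feeds _prompt directly, so its keys
# must match the prompt variables ("chat_history" for the MessagesPlaceholder and
# "text" for the user turn).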


# If you update the name of this chain, you MUST also update ../pyproject.toml
# with the new `tool.langserve.export_attr`.
chain = (
    (format_chat_history | _prompt | _model | StrOutputParser())
    # This is to add the evaluators as "listeners"
    # and to customize the name of the chain.
    # Any chain that accepts a compatible input type works here.
    .with_config(
        run_name="ChatBot",
        callbacks=[
            EvaluatorCallbackHandler(
                evaluators=[
                    ResponseEffectivenessEvaluator(evaluate_response_effectiveness)
                ]
            )
        ],
    )
)
chain = chain.with_types(input_type=ChainInput)
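
# A minimal sketch of exercising the chain locally (illustrative, not part of the
# original template; assumes OPENAI_API_KEY and LangSmith credentials are set so
# the evaluator callback can record feedback):
if __name__ == "__main__":
    example_input = {
        "chat_history": [
            {"type": "human", "content": "hi"},
            {"type": "ai", "content": "Ahoy, matey! How can I help ye?"},
        ],
        # Omitting last_run_id keeps the evaluator from trying to attach
        # feedback to a run that does not exist.
        "text": "What's the weather like at sea?",
    }
    print(chain.invoke(example_input))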