from __future__ import annotations

from typing import List, Optional

from langchain import hub
from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
from langchain.callbacks.tracers.schemas import Run
from langchain.schema import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    StrOutputParser,
    get_buffer_string,
)
from langchain_community.chat_models import ChatOpenAI
from langchain_core.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import Runnable
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example

###############################################################################
# |   Chat Bot Evaluator Definition
# | This section defines an evaluator that evaluates any chat bot
# | without explicit user feedback. It formats the dialog up to
# | the current message and then instructs an LLM to grade the last AI response
# | based on the subsequent user response. If no chat history is present,
# V the evaluator is not called.
###############################################################################


class ResponseEffectiveness(BaseModel):
    """Score the effectiveness of the AI chat bot response."""

    reasoning: str = Field(
        ...,
        description="Explanation for the score.",
    )
    score: int = Field(
        ...,
        min=0,
        max=5,
        description="Effectiveness of AI's final response.",
    )


def format_messages(input: dict) -> List[BaseMessage]:
    """Format the messages for the evaluator."""
    chat_history = input.get("chat_history") or []
    results = []
    for message in chat_history:
        if message["type"] == "human":
            results.append(HumanMessage.parse_obj(message))
        else:
            results.append(AIMessage.parse_obj(message))
    return results


def format_dialog(input: dict) -> dict:
    """Format messages and convert to a single string."""
    chat_history = format_messages(input)
    formatted_dialog = get_buffer_string(chat_history) + f"\nhuman: {input['text']}"
    return {"dialog": formatted_dialog}


def normalize_score(response: dict) -> dict:
    """Normalize the score to be between 0 and 1."""
    response["score"] = int(response["score"]) / 5
    return response


# To view the prompt in the playground: https://smith.langchain.com/hub/wfh/response-effectiveness
evaluation_prompt = hub.pull("wfh/response-effectiveness")
evaluate_response_effectiveness = (
    format_dialog
    | evaluation_prompt
    # bind_functions formats the schema for the OpenAI function
    # calling endpoint, which returns more reliable structured data.
    | ChatOpenAI(model="gpt-3.5-turbo").bind_functions(
        functions=[ResponseEffectiveness],
        function_call="ResponseEffectiveness",
    )
    # Convert the model's output to a dict
    | JsonOutputFunctionsParser(args_only=True)
    | normalize_score
)
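
# A minimal sketch (not part of the template itself) of exercising the evaluator
# chain above on its own, assuming OPENAI_API_KEY is set. The payload mirrors what
# `evaluate_run` passes in: the serialized chat history plus the latest user
# message. The exact output fields depend on the "wfh/response-effectiveness" hub
# prompt, so the result shown is illustrative rather than a guaranteed schema.
#
#   sample_inputs = {
#       "chat_history": [
#           {"type": "human", "content": "Who wrote Treasure Island?"},
#           {"type": "ai", "content": "Arr, 'twas Robert Louis Stevenson, matey!"},
#       ],
#       "text": "Perfect, that's exactly what I was after.",
#   }
#   graded = evaluate_response_effectiveness.invoke(sample_inputs)
#   # e.g. {"reasoning": "...", "score": 0.8}  <- normalized to [0, 1] by normalize_score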


class ResponseEffectivenessEvaluator(RunEvaluator):
    """Evaluate the chat bot based the subsequent user responses."""

    def __init__(self, evaluator_runnable: Runnable) -> None:
        super().__init__()
        self.runnable = evaluator_runnable

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        # This evaluator grades the AI's PREVIOUS response.
        # If no chat history is present, there isn't anything to evaluate
        # (it's the user's first message)
        if not run.inputs.get("chat_history"):
            return EvaluationResult(
                key="response_effectiveness", comment="No chat history present."
            )
        # This only occurs if the client isn't correctly sending the run IDs
        # of the previous calls.
        elif "last_run_id" not in run.inputs:
            return EvaluationResult(
                key="response_effectiveness", comment="No last run ID present."
            )
        # Call the LLM to evaluate the response
        eval_grade: Optional[dict] = self.runnable.invoke(run.inputs)
        target_run_id = run.inputs["last_run_id"]
        return EvaluationResult(
            **eval_grade,
            key="response_effectiveness",
            target_run_id=target_run_id,  # Requires langsmith >= 0.0.54
        )


###############################################################################
# |           The chat bot definition
# | This is what is actually exposed by LangServe in the API.
# | It can be any chain that accepts the ChainInput schema and returns a str;
# | all that is required is the with_config() call at the end to add the
# V evaluators as "listeners" to the chain.
###############################################################################


class ChainInput(BaseModel):
    """Input for the chat bot."""

    chat_history: Optional[List[BaseMessage]] = Field(
        description="Previous chat messages."
    )
    text: str = Field(..., description="User's latest query.")
    last_run_id: Optional[str] = Field("", description="Run ID of the last run.")


_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant who speaks like a pirate",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{text}"),
    ]
)
_model = ChatOpenAI()


def format_chat_history(chain_input: dict) -> dict:
    messages = format_messages(chain_input)

    return {
        "chat_history": messages,
        "text": chain_input.get("text"),
    }


# if you update the name of this, you MUST also update ../pyproject.toml
# with the new `tool.langserve.export_attr`
chain = (
    (format_chat_history | _prompt | _model | StrOutputParser())
    # This is to add the evaluators as "listeners"
    # and to customize the name of the chain.
    # Any chain that accepts a compatible input type works here.
    .with_config(
        run_name="ChatBot",
        callbacks=[
            EvaluatorCallbackHandler(
                evaluators=[
                    ResponseEffectivenessEvaluator(evaluate_response_effectiveness)
                ]
            )
        ],
    )
)

chain = chain.with_types(input_type=ChainInput)
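

# A hedged usage sketch, not part of the served template: a client calls the chain
# with the run ID returned by its previous request so the evaluator can attach its
# score to that earlier run. Running this requires OPENAI_API_KEY, plus LangSmith
# credentials for the evaluator callback to log feedback; "example-last-run-id" is
# a placeholder value, not a real run ID.
if __name__ == "__main__":
    response = chain.invoke(
        {
            "text": "Thanks, that answers my question!",
            "chat_history": [
                {"type": "human", "content": "Who wrote Treasure Island?"},
                {"type": "ai", "content": "Arr, 'twas Robert Louis Stevenson, matey!"},
            ],
            "last_run_id": "example-last-run-id",
        }
    )
    print(response)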