File size: 4,163 Bytes
62da328
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import re

from pydantic import BaseModel, Field, field_validator


class AlpacaItem(BaseModel):
    r"""Represents an instruction-response item in the Alpaca format.

    Appropriate for both cases where the input field is empty or populated.
    Provides parsing from string format using the class method from_string()
    and serialization back to that format using to_string().

    Args:
        instruction (str): The instruction/question/prompt
        input (str): Input context or examples (put empty string if none)
        output (str): The response/answer to the instruction
    """

    instruction: str = Field(description="The instruction/question/prompt")
    input: str = Field(
        description="Optional context or input for the task."
        " For example, when the instruction is \"Summarize the "
        "following article\", the input is the article."
    )
    output: str = Field(description="The response/answer to the instruction")

    @field_validator('instruction', 'output')
    @classmethod
    def no_section_markers(cls, value: str) -> str:
        r"""Ensures fields don't contain section markers like '###
        Response:'

        Args:
            value (str): Raw value of the `instruction` or `output` field.

        Returns:
            str: The value with surrounding whitespace stripped.

        Raises:
            ValueError: If the value contains '### Response',
                '### Instruction' or '### Input'.
        """
        if (
            '### Response' in value
            or '### Instruction' in value
            or '### Input' in value
        ):
            raise ValueError("Field cannot contain section markers")
        return value.strip()

    @classmethod
    def from_string(cls, text: str) -> "AlpacaItem":
        r"""Creates an AlpacaItem from a formatted string.

        Args:
            text: String in either of these formats:
                 With input:
                 ### Instruction:
                 {instruction}
                 ### Input:
                 {input}
                 ### Response:
                 {response}

                 Without input:
                 ### Instruction:
                 {instruction}
                 ### Response:
                 {response}

                 An '### Input:' section with an empty body (as produced
                 by to_string() when input is "") is parsed as an empty
                 input.

        Returns:
            AlpacaItem: Parsed instance

        Raises:
            ValueError: text doesn't match expected format, or the
                instruction/response sections are missing or empty
        """
        # Strip and standardize newlines
        text = text.strip().replace('\r\n', '\n')

        # Only horizontal whitespace may sit between a header's colon and
        # its newline, and the body may be empty. Using `\s*` and `.+?`
        # here would let the header swallow the blank lines of an empty
        # section and then capture the *next* header as this section's
        # body (e.g. input == "### Response:\n...").
        section_pattern = r'###\s*{}:[^\S\n]*\n(.*?)(?=\n###|\Z)'

        instruction_match = re.search(
            section_pattern.format('Instruction'), text, re.DOTALL
        )
        input_match = re.search(
            section_pattern.format('Input'), text, re.DOTALL
        )
        response_match = re.search(
            section_pattern.format('Response'), text, re.DOTALL
        )

        if not instruction_match or not response_match:
            raise ValueError(
                "Text must contain '### Instruction:'"
                " and '### Response:' sections"
            )

        instruction = instruction_match.group(1).strip()
        response = response_match.group(1).strip()

        # The two mandatory sections must carry content; only the input
        # section is allowed to be empty.
        if not instruction or not response:
            raise ValueError(
                "'### Instruction:' and '### Response:' sections"
                " must not be empty"
            )

        return cls(
            instruction=instruction,
            input=input_match.group(1).strip() if input_match else "",
            output=response,
        )

    def to_string(self) -> str:
        r"""Converts the AlpacaItem to its string representation.

        The '### Input:' section is always emitted, even when the input
        is an empty string; from_string() parses such a section back to
        an empty input.

        Returns:
            str: Formatted string representation with sections markers
        """
        return "\n".join(
            [
                "### Instruction:",
                self.instruction,
                "",
                "### Input:",
                self.input,
                "",
                "### Response:",
                self.output,
            ]
        )