Spaces:

rote1
/

IAGO

Running

File size: 4,163 Bytes

62da328

# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========

import re

from pydantic import BaseModel, Field, field_validator


class AlpacaItem(BaseModel):
    r"""Represents an instruction-response item in the Alpaca format.

    Appropriate for both cases where the input field is empty or populated.
    Provides parsing from the sectioned string format via the class method
    from_string() and serialization back to that format via to_string().

    Args:
        instruction (str): The instruction/question/prompt
        input (str): Input context or examples (put empty string if none)
        output (str): The response/answer to the instruction
    """

    instruction: str = Field(description="The instruction/question/prompt")
    input: str = Field(
        description="Optional context or input for the task."
        " For example, when the instruction is \"Summarize the "
        "following article\", the input is the article."
    )
    output: str = Field(description="The response/answer to the instruction")

    @field_validator('instruction', 'output')
    @classmethod
    def no_section_markers(cls, value: str) -> str:
        r"""Rejects section markers inside a field and trims whitespace.

        Applied to the ``instruction`` and ``output`` fields only.

        Args:
            value (str): The raw field value.

        Returns:
            str: The value with leading/trailing whitespace removed.

        Raises:
            ValueError: If the value contains '### Response',
                '### Instruction' or '### Input'.
        """
        if (
            '### Response' in value
            or '### Instruction' in value
            or '### Input' in value
        ):
            raise ValueError("Field cannot contain section markers")
        return value.strip()

    @classmethod
    def from_string(cls, text: str) -> "AlpacaItem":
        r"""Creates an AlpacaItem from a formatted string.

        A section's content is every line after its header up to (but not
        including) the next line that starts with '###', or the end of the
        text. An '### Input:' section that is present but empty yields an
        empty ``input``, so the output of to_string() always parses back
        to an equal item.

        Args:
            text: String in either of these formats:
                 With input:
                 ### Instruction:
                 {instruction}
                 ### Input:
                 {input}
                 ### Response:
                 {response}

                 Without input:
                 ### Instruction:
                 {instruction}
                 ### Response:
                 {response}

        Returns:
            AlpacaItem: Parsed instance

        Raises:
            ValueError: text doesn't match expected format, or the
                instruction/response section is missing or empty
        """
        # Strip and standardize newlines
        text = text.strip().replace('\r\n', '\n')

        sections = {}
        for name in ("Instruction", "Input", "Response"):
            # Header, optional same-line whitespace, then zero or more
            # content lines none of which starts with '###'. Consuming
            # content line by line (instead of `\s*\n(.+?)`) keeps an empty
            # section from swallowing the header that follows it.
            match = re.search(
                rf'###\s*{name}:[^\S\n]*((?:\n(?!###).*)*)', text
            )
            sections[name] = match.group(1).strip() if match else ""

        instruction = sections["Instruction"]
        response = sections["Response"]

        # An absent section and an empty one are both unusable here; only
        # the input section is allowed to be empty.
        if not instruction or not response:
            raise ValueError(
                "Text must contain '### Instruction:'"
                " and '### Response:' sections"
            )

        return cls(
            instruction=instruction,
            input=sections["Input"],
            output=response,
        )

    def to_string(self) -> str:
        r"""Converts the AlpacaItem to its string representation.

        The '### Input:' section is always emitted, even when ``input`` is
        an empty string.

        Returns:
            str: Formatted string representation with sections markers
        """
        return "\n".join(
            [
                "### Instruction:",
                self.instruction,
                "",
                "### Input:",
                self.input,
                "",
                "### Response:",
                self.output,
            ]
        )