File size: 2,301 Bytes
fc3399e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd
from pydantic import ConfigDict

from bytelatent.data.data_types import BltExample
from bytelatent.data.iterators.abstract_iterator import (
    PydanticIteratorState,
    StatefulIterator,
)


class BltTestIteratorState(PydanticIteratorState):
    """Serializable snapshot of a ``BltTestIterator``.

    Fields:
        position: value of the iterator's ``position`` counter when the
            state was captured.
        total: number of examples the iterator yields per ``create_iter``.
    """

    # Reject unknown fields so a mismatched/stale state fails loudly.
    model_config = ConfigDict(extra="forbid")
    position: int
    total: int

    def build(self) -> "BltTestIterator":
        """Recreate the ``BltTestIterator`` described by this state.

        Returns:
            A new ``BltTestIterator`` with the same ``total`` and with its
            ``position`` counter restored to the captured value.
        """
        # Build the iterator, not another state object: constructing the
        # state class here would both return the wrong type and omit the
        # required ``position`` field.
        blt_iter = BltTestIterator(total=self.total)
        blt_iter.position = self.position
        return blt_iter


class BltTestIterator(StatefulIterator):
    """Test iterator that yields ``total`` synthetic text-only examples.

    ``position`` counts how many examples have been produced so far; it is
    reported through ``get_state`` but is not consulted by ``create_iter``,
    which always starts from index 0.
    """

    def __init__(self, total: int):
        self.position, self.total = 0, total

    def get_state(self):
        """Return a ``BltTestIteratorState`` with the current counters."""
        return BltTestIteratorState(total=self.total, position=self.position)

    def create_iter(self):
        """Yield ``total`` ``BltExample`` objects carrying only id and text.

        ``position`` is incremented before each example is yielded.
        """
        for sample_index in range(self.total):
            self.position += 1
            example = BltExample(
                sample_id=f"test_{sample_index}",
                text=f"This is some test {sample_index} text.",
                tokens=None,
                mask=None,
                entropies=None,
                patch_lengths=None,
            )
            yield example


class BltTestWithEntropiesIteratorState(PydanticIteratorState):
    """Serializable snapshot of a ``BltTestWithEntropiesIterator``.

    Fields:
        position: value of the iterator's ``position`` counter when the
            state was captured.
        total: number of examples the iterator yields per ``create_iter``.
    """

    # Reject unknown fields so a mismatched/stale state fails loudly.
    model_config = ConfigDict(extra="forbid")
    position: int
    total: int

    def build(self) -> "BltTestWithEntropiesIterator":
        """Recreate the ``BltTestWithEntropiesIterator`` for this state.

        Returns:
            A new ``BltTestWithEntropiesIterator`` with the same ``total``
            and with its ``position`` counter restored to the captured value.
        """
        # Build the iterator, not another state object: constructing the
        # state class here would both return the wrong type and omit the
        # required ``position`` field.
        blt_iter = BltTestWithEntropiesIterator(total=self.total)
        blt_iter.position = self.position
        return blt_iter


class BltTestWithEntropiesIterator(StatefulIterator):
    """Test iterator yielding ``total`` copies of one tokenized example.

    Every example carries the same fixed text together with token ids and
    entropies loaded from ``fixtures/tokens_with_entropies.json``.
    ``position`` counts how many examples have been produced so far; it is
    reported through ``get_state`` but is not consulted by ``create_iter``,
    which always starts from index 0.
    """

    def __init__(self, total: int):
        self.position = 0
        self.total = total

    def get_state(self):
        """Return a ``BltTestWithEntropiesIteratorState`` for this iterator."""
        # Use the state class that matches this iterator, so the state
        # names the right iterator type (not the plain ``BltTestIterator``).
        return BltTestWithEntropiesIteratorState(
            position=self.position, total=self.total
        )

    def create_iter(self):
        """Yield ``total`` ``BltExample`` objects with tokens and entropies.

        The fixture is read once per call; the path is relative, so it is
        resolved against the current working directory. The same ``tokens``
        and ``entropies`` list objects are shared by every yielded example.

        Raises:
            AssertionError: if the fixture's token count is not
                ``len(text) + 2``.
        """
        text = "Daenerys Targaryen is in Game of Thrones, a fantasy epic by George R.R. Martin."
        df = pd.read_json("fixtures/tokens_with_entropies.json")
        tokens = df["token_ids"].tolist()
        entropies = df["entropies"].tolist()
        # BOS and EOS
        assert len(tokens) == len(text) + 2, (
            f"expected {len(text) + 2} tokens in fixture, got {len(tokens)}"
        )
        for i in range(self.total):
            self.position += 1
            yield BltExample(
                sample_id=f"test_{i}",
                text=text,
                tokens=tokens,
                mask=[True] * len(tokens),
                entropies=entropies,
                patch_lengths=None,
            )