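"""Gradio demo for browsing cached feature activations.

Loads per-layer activation caches (Parquet files under weights/caches), the token
table they index into, and feature explanations from the
kisate-team/feature-explanations dataset, then highlights the context around each
position where a selected feature activates.
"""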
import gradio as gr
import pyarrow.parquet as pq
import pyarrow.compute as pc
from transformers import AutoTokenizer
from datasets import load_dataset
import os
import numpy as np


# Token ids of the cached corpus; the activation caches index into this table.
token_table = pq.read_table("weights/tokens.parquet")
cache_path = "weights/caches"
parquets = os.listdir(cache_path)
TOKENIZER = "microsoft/Phi-3-mini-4k-instruct"

dataset = load_dataset("kisate-team/feature-explanations", split="train")

def find_revisions():
    """Collect the revisions that have at least one activation cache on disk."""
    revisions = set()
    for parquet in parquets:
        if parquet.endswith(".parquet"):
            parts = parquet.split("-")
            if len(parts) > 2:
                revisions.add(int(parts[2][1:]))
    return sorted(revisions)

def find_layers(revision):
    """Collect the layers that have an activation cache for the given revision."""
    layers = set()
    for parquet in parquets:
        if parquet.endswith(".parquet"):
            parts = parquet.split("-")
            if len(parts) > 2 and int(parts[2][1:]) == revision:
                layers.add(int(parts[1][1:]))
    return sorted(layers)

revisions = find_revisions()
layers = {
    revision: find_layers(revision) for revision in revisions
}

# Explanations nested as features[revision][layer][feature_id].
features = {
    revision: {
        layer: {
            item["feature"]:item for item in dataset if item["layer"] == layer and item["version"] == revision
        } for layer in layers[revision]
    } for revision in revisions
}

# layers = dataset.unique("layer")

nearby = 8     # tokens of context kept on each side of an activating position
stride = 0.25  # scale applied to the cached "nearby" activation values
n_bins = 10    # number of entries in the colour-map legend

def make_cache_name(layer, revision):
    """Path of the activation cache for one layer/revision pair."""
    return f"{cache_path}/phi-l{layer}-r{revision}-st0.25x128-activations.parquet"

with gr.Blocks() as demo:
    feature_table = gr.State(None)

    tokenizer_name = gr.Textbox(TOKENIZER, label="Tokenizer")
    revision_dropdown = gr.Dropdown(revisions, label="Revision")

    # Default to the layers of the first available revision rather than a hard-coded one.
    layer_dropdown = gr.Dropdown(layers[revisions[0]], label="Layer")

    def update_features(revision, layer):
        # Currently unused: features are selected via the `feature_input` number box below.
        feature_dropdown = gr.Dropdown(list(features[revision][layer].keys()))
        return feature_dropdown
    
    def update_layers(revision):
        layer_dropdown = gr.Dropdown(layers[revision])
        return layer_dropdown

    frequency = gr.Number(0, label="Total frequency (%)")

    # layer_dropdown.input(update_features, layer_dropdown, feature_dropdown)
    # histogram = gr.LinePlot(x="activation", y="freq")

    revision_dropdown.input(update_layers, revision_dropdown, layer_dropdown)

    feature_input = gr.Number(0, label="Feature")

    autoi_expl = gr.Textbox(label="AutoInterp Explanation")
    selfe_expl = gr.Textbox(label="SelfExplain Explanation")

    cm = gr.HighlightedText()     # colour-map legend: activation value -> normalised score
    frame = gr.HighlightedText()  # context windows with per-token activation highlights

    def update(revision, layer, feature, tokenizer_name):
        """Build the highlighted contexts, legend, frequency and explanations for one feature."""
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        table = pq.read_table(make_cache_name(layer, revision))
        table_feat = table.filter(pc.field("feature") == feature).to_pandas()

        # freq_t = table_feat[["activation", "freq"]]
        # Overall firing frequency of the feature, as a percentage.
        total_freq = float(table_feat["freq"].sum()) * 100

        # Drop rows with zero activation or zero frequency.
        table_feat = table_feat[table_feat["activation"] > 0]
        table_feat = table_feat[table_feat["freq"] > 0]

        table_feat = table_feat.sort_values("activation", ascending=False)

        # Decode a window of tokens around each activating position and re-tokenize it for display.
        texts = table_feat["token"].apply(
            lambda x: tokenizer.decode(token_table[max(0, x - nearby - 1):x + nearby + 1]["tokens"].to_numpy())
        )

        texts = [tokenizer.tokenize(text) for text in texts]
        activations = table_feat["nearby"].to_numpy()
        if len(activations) > 0:
            # Scale the cached activations by `stride` and normalise by the feature's
            # maximum activation for highlighting.
            activations = np.stack(activations) * stride
            max_act = table_feat["activation"].max()
            activations = activations / max_act

            highlight_data = [
                [(tok, act) for tok, act in zip(text, acts)] + [("\n", 0)]
                for text, acts in zip(texts, activations)
            ]

            flat_data = [item for sublist in highlight_data for item in sublist]

            # Legend entries mapping normalised scores back to activation values.
            color_map_data = [i / n_bins for i in range(n_bins + 1)]
            color_map_data = [(f"{i * max_act:.2f}", i) for i in color_map_data]
        else:
            flat_data = []
            color_map_data = []

        # Look up stored explanations for this feature, if any.
        if feature in features[revision][layer]:
            autoi_expl = features[revision][layer][feature]["explanation"]
            selfe_expl = features[revision][layer][feature]["gen_explanations"]
            if selfe_expl is not None:
                selfe_expl = "\n".join(
                    f"{i+1}. \"{x}\"" for i, x in enumerate(selfe_expl)
                )

        else:
            autoi_expl = "No explanation found"
            selfe_expl = "No explanation found"
        return flat_data, color_map_data, total_freq, autoi_expl, selfe_expl
        

    # feature_dropdown.change(update, [layer_dropdown, feature_dropdown, tokenizer_name], [frame, cm, frequency, autoi_expl, selfe_expl])
    feature_input.change(update, [revision_dropdown, layer_dropdown, feature_input, tokenizer_name], [frame, cm, frequency, autoi_expl, selfe_expl])


if __name__ == "__main__":
    demo.launch(share=True)