Spaces:
Sleeping
Sleeping
File size: 2,772 Bytes
73ab266 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import gradio as gr
import pyarrow.parquet as pq
import pyarrow.compute as pc
from transformers import AutoTokenizer
import os
import numpy as np
# Table with a "tokens" column; sliced by row position to recover the tokens
# surrounding a feature activation.
token_table = pq.read_table("weights/tokens.parquet")

# Directory whose entries are offered in the cache dropdown.
cache_path = "weights/caches"
# os.listdir() returns entries in arbitrary order; sort so the dropdown
# choices are stable between runs.
parquets = sorted(os.listdir(cache_path))

# Default tokenizer shown in the tokenizer textbox.
TOKENIZER = "microsoft/Phi-3-mini-4k-instruct"

# Number of tokens of context taken on each side of an activating token.
nearby = 8
# Multiplier applied to the stored "nearby" activation values.
# NOTE(review): presumably the de-quantisation step of the cache - confirm.
stride = 0.25
# Number of intervals in the colour legend (n_bins + 1 legend entries).
n_bins = 10
with gr.Blocks() as demo:
    # State holder; no handler in this block reads or writes it.
    feature_table = gr.State(None)

    # --- Inputs ---
    tokenizer_name = gr.Textbox(TOKENIZER)
    dropdown = gr.Dropdown(parquets)
    feature_input = gr.Number(0)
    token_range = gr.Number(64)

    # --- Outputs ---
    frequency = gr.Number(0, label="Total frequency (%)")
    histogram = gr.LinePlot(x="activation", y="freq")
    cm = gr.HighlightedText()
    # The component class is `HighlightedText`; the previous spelling
    # `Highlightedtext` does not exist and raised AttributeError on startup.
    frame = gr.HighlightedText(show_legend=True)

    def update(cache_name, feature, tokenizer_name, token_range):
        """Recompute every output for the selected cache file and feature.

        Args:
            cache_name: File name inside ``cache_path`` to read, or None when
                nothing is selected in the dropdown.
            feature: Value matched against the cache's "feature" column.
            tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
            token_range: Currently unused; kept so the wired inputs match.

        Returns:
            A 4-tuple ``(flat_data, color_map_data, freq_t, total_freq)``:
            ``flat_data`` is a list of ``(token, scaled_activation)`` pairs
            with a ``("\\n", 0)`` entry after each example; ``color_map_data``
            is a list of ``(label, fraction)`` legend entries; ``freq_t`` is
            the "activation"/"freq" columns for all rows of the feature;
            ``total_freq`` is the summed "freq" times 100. When there is no
            selection, four no-op updates are returned instead.
        """
        if cache_name is None or feature is None:
            # This handler is wired to four outputs, so a bare `return`
            # (None) does not supply a value per output. Return explicit
            # no-op updates to leave every component unchanged.
            return gr.update(), gr.update(), gr.update(), gr.update()

        # NOTE(review): the tokenizer is reloaded on every event; consider
        # caching it per tokenizer_name if this becomes slow.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        table = pq.read_table(f"{cache_path}/{cache_name}")
        table_feat = table.filter(pc.field("feature") == feature).to_pandas()

        # Histogram and total frequency use all rows of the feature, taken
        # before the positive-only filtering below.
        freq_t = table_feat[["activation", "freq"]]
        total_freq = float(table_feat["freq"].sum()) * 100

        table_feat = table_feat[table_feat["activation"] > 0]
        table_feat = table_feat[table_feat["freq"] > 0]
        table_feat = table_feat.sort_values("activation", ascending=False)

        # Decode the window of tokens around each activating position, then
        # split the decoded text back into token strings for display.
        texts = table_feat["token"].apply(
            lambda x: tokenizer.decode(
                token_table[max(0, x - nearby - 1):x + nearby + 1]["tokens"].to_numpy()
            )
        )
        texts = [tokenizer.tokenize(text) for text in texts]

        activations = table_feat["nearby"].to_numpy()
        if len(activations) > 0:
            activations = np.stack(activations) * stride
            max_act = table_feat["activation"].max()
            # Scale by the largest activation of this feature.
            activations = activations / max_act

            # One list per example; zip() truncates to the shorter of the
            # token list and the activation row.
            highlight_data = [
                [(token, value) for token, value in zip(tokens, row)] + [("\n", 0)]
                for tokens, row in zip(texts, activations)
            ]
            flat_data = [item for sublist in highlight_data for item in sublist]

            # Legend: n_bins + 1 evenly spaced fractions, labelled with the
            # corresponding activation value.
            color_map_data = [i / n_bins for i in range(n_bins + 1)]
            color_map_data = [(f"{i*max_act:.2f}", i) for i in color_map_data]
        else:
            flat_data = []
            color_map_data = []

        return flat_data, color_map_data, freq_t, total_freq

    # Both triggers share the same handler, inputs and outputs.
    update_inputs = [dropdown, feature_input, tokenizer_name, token_range]
    update_outputs = [frame, cm, histogram, frequency]
    dropdown.change(update, update_inputs, update_outputs)
    feature_input.change(update, update_inputs, update_outputs)
# Start the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    demo.launch(share=True)
|