Spaces:

sweepai
/

qwen-tokenizer

Sleeping

App Files Files Community

kevinlu1248 committed 14 days ago

Commit

8e147a8

verified ·

1 Parent(s): 7c8b294

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +62 -39

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,63 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import html
 import streamlit as st
+from transformers import AutoTokenizer
+import colorsys
+# Configure the page: wide layout and the page title.
+st.set_page_config(layout="wide", page_title="Text Tokenizer")
+def get_random_color(token_id):
+    """Return a CSS ``hsla()`` background colour derived from *token_id*.
+
+    The hue is taken from a CRC-32 of the id's text form rather than from the
+    built-in ``hash()``: string hashing is salted per interpreter process, so
+    it would give the same token a different colour after every restart.
+
+    Args:
+        token_id: Token id; only its ``str()`` form is used.
+
+    Returns:
+        A string of the form ``"hsla(<hue>, 70%, 30%, 70%)"`` with the hue in
+        the range 0-359.
+    """
+    import zlib  # local import so this block does not depend on other edits
+
+    # Keep the original quantisation of the hue into 1000 buckets.
+    hue = (zlib.crc32(str(token_id).encode("utf-8")) % 1000) / 1000.0
+    return f"hsla({int(hue * 360)}, 70%, 30%, 70%)"
+def load_tokenizer(model_name="Qwen/Qwen2.5-Coder-7B-Instruct"):
+    """Return the tokenizer for *model_name*, cached in ``st.session_state``.
+
+    The cached instance is reused only when it was loaded for the same
+    *model_name*. Asking for a different model replaces it, instead of
+    handing back the tokenizer of whichever model was loaded first.
+
+    Args:
+        model_name: Model identifier passed to ``AutoTokenizer.from_pretrained``.
+
+    Returns:
+        The tokenizer stored in ``st.session_state.tokenizer``.
+    """
+    state = st.session_state
+    if 'tokenizer' not in state or state.get('tokenizer_model') != model_name:
+        # NOTE(review): trust_remote_code=True allows code shipped with the
+        # model repository to run during loading — confirm it is required.
+        state.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        # Recorded only after a successful load, so a failed load caches nothing.
+        state.tokenizer_model = model_name
+    return state.tokenizer
+# Page body: load the tokenizer, read the user's text, then render the tokens.
+st.title("Text Tokenizer")
+# The model name is a fixed string; this script has no selection widget.
+selected_model = "Qwen/Qwen2.5-Coder-7B-Instruct"
+# Load the tokenizer; on any failure show the error and stop the script.
+try:
+    tokenizer = load_tokenizer(selected_model)
+    st.success(f"Loaded tokenizer: {selected_model}")
+except Exception as e:
+    st.error(f"Failed to load tokenizer: {e}")
+    st.stop()
+# Input text area
+input_text = st.text_area("Enter text to tokenize", height=200)
+# Proceed only when the button is pressed and the text area is non-empty.
+if st.button("Tokenize") and input_text:
+    tokens = tokenizer.encode(input_text)
+    st.write(f"Total tokens: {len(tokens)}")
+    # Build one coloured <span> per token.
+    result = ""
+    prev_tokens = []
+    prev_string = ""
+    for token in tokens:
+        color = get_random_color(token)
+        # Decode the prefix that ends at this token and keep only the part
+        # beyond the previously decoded length: that is this token's text.
+        # The whole prefix is decoded again on every iteration.
+        # NOTE(review): presumably chosen over decoding each id on its own so
+        # the text matches what the full decode produces — confirm.
+        current_string = tokenizer.decode(prev_tokens + [token])
+        prev_tokens.append(token)
+        current_delta = current_string[len(prev_string):]
+        prev_string = current_string
+        # Escape first, so the only markup in the output is what is added below.
+        current_delta = html.escape(current_delta)
+        # Make whitespace visible: newline marker plus <br/>, non-breaking
+        # spaces for spaces, and four of them for a tab.
+        current_delta = (current_delta
+            .replace("\n", "↵<br/>")
+            .replace(" ", "&nbsp;")
+            .replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"))
+        result += f'<span style="background-color: {color};">{current_delta}</span>'
+    st.html(f'<pre style="background-color: #222; padding: 10px; font-family: Courier, monospace;">{result}</pre>')
+    # Raw token list: here each id is decoded on its own, unlike the
+    # prefix-based rendering above.
+    with st.expander("View raw tokens"):
+        token_strings = [tokenizer.decode([t]) for t in tokens]
+        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
+            st.write(f"{i}: Token ID {token_id} → '{token_str}'")