File size: 6,941 Bytes
8250356
400e980
8250356
 
c62a8cf
8250356
 
 
 
 
 
 
c083776
 
 
 
8250356
 
 
 
 
 
 
 
 
 
c083776
8250356
 
400e980
 
 
 
8250356
 
 
c083776
 
 
 
 
 
 
 
 
 
8250356
 
400e980
c083776
8250356
 
c083776
8250356
 
c62a8cf
 
 
 
 
 
 
400e980
c62a8cf
8250356
400e980
b1e1362
400e980
 
 
 
 
 
 
 
 
 
b1e1362
400e980
 
8250356
c62a8cf
8250356
c083776
 
400e980
 
 
 
 
 
 
 
 
 
c083776
 
 
 
 
 
 
 
 
 
8250356
 
400e980
c083776
8250356
 
c083776
8250356
 
 
 
 
 
 
 
 
 
400e980
 
 
 
8250356
 
e38a225
8250356
 
 
 
 
 
 
e38a225
8250356
 
 
 
400e980
8250356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400e980
 
8250356
 
c083776
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import gradio as gr
import json
import pandas as pd
import tiktoken
import anthropic

def _count_anthropic_tokens(client, model, text):
    """Return the Anthropic token count for *text* via the beta count-tokens API.

    Sends the text as a single user message; the response object exposes the
    count directly as ``input_tokens``. Raises whatever the SDK raises
    (network errors, invalid model, missing API key) — callers handle it.
    """
    response = client.beta.messages.count_tokens(
        betas=["token-counting-2024-11-01"],
        model=model,
        messages=[{
            "role": "user",
            "content": text,
        }],
    )
    return response.input_tokens


def process_csv(file, calculate_openai, openai_model, calculate_anthropic, anthropic_model):
    """Count tokens in an uploaded CSV and return a Markdown summary.

    Parameters
    ----------
    file : str | None
        Path to the uploaded CSV file (Gradio ``type="filepath"``), or None.
    calculate_openai : bool
        If True, count tokens with tiktoken for ``openai_model``.
    openai_model : str
        OpenAI model name used to pick the tiktoken encoding.
    calculate_anthropic : bool
        If True, count tokens via the Anthropic beta count-tokens endpoint.
    anthropic_model : str
        Anthropic model name passed to the API.

    Returns
    -------
    str
        Markdown with the total token count and a per-column breakdown for
        each selected provider, or a human-readable error/usage message.
    """
    if file is None:
        return "Please upload a CSV file."

    try:
        df = pd.read_csv(file)
    except Exception as e:
        return f"Error reading CSV file: {e}"

    output = ""

    if calculate_openai:
        # Pick the encoding for the chosen model; fall back to cl100k_base
        # for model names tiktoken doesn't know.
        try:
            openai_encoding = tiktoken.encoding_for_model(openai_model)
        except KeyError:
            openai_encoding = tiktoken.get_encoding("cl100k_base")

        # Total: tokenize the whole CSV serialization in one pass.
        try:
            total_tokens_openai = len(openai_encoding.encode(df.to_csv(index=False)))
        except Exception as e:
            return f"Error counting tokens with OpenAI model: {e}"

        # Per column: header plus all values joined by newlines, one encode
        # call per column (much cheaper than encoding cell by cell).
        token_counts_openai = {}
        for col in df.columns:
            try:
                tokens_openai = openai_encoding.encode('\n'.join([col] + list(df[col].astype(str).values)))
            except Exception as e:
                return f"Error counting tokens with OpenAI model: {e}"
            token_counts_openai[col] = len(tokens_openai)

        output += f"\n**Total OpenAI Tokens ({openai_model}): {total_tokens_openai}**\n"
        output += f"\n**OpenAI Token Counts per Column ({openai_model}):**\n\n"
        for col, count in token_counts_openai.items():
            output += f"- {col}: {count} tokens\n"

    if calculate_anthropic:
        # The SDK reads ANTHROPIC_API_KEY from the environment.
        client = anthropic.Anthropic()

        # Total: one API call over the whole CSV serialization.
        try:
            total_tokens_anthropic = _count_anthropic_tokens(client, anthropic_model, df.to_csv(index=False))
        except Exception as e:
            return f"Error counting tokens with Anthropic model: {e}"

        # Per column: one API call per column (header + values).
        token_counts_anthropic = {}
        for col in df.columns:
            try:
                token_counts_anthropic[col] = _count_anthropic_tokens(
                    client, anthropic_model, '\n'.join([col] + list(df[col].astype(str).values))
                )
            except Exception as e:
                return f"Error counting tokens with Anthropic model: {e}"

        output += f"\n**Total Anthropic Tokens ({anthropic_model}): {total_tokens_anthropic}**\n"
        output += f"\n**Anthropic Token Counts per Column ({anthropic_model}):**\n"
        for col, count in token_counts_anthropic.items():
            output += f"- {col}: {count} tokens\n"

    if not calculate_openai and not calculate_anthropic:
        output = "Please select at least one model to calculate tokens."

    return output

def main():
    """Assemble the Gradio UI for the token counter and launch it."""
    with gr.Blocks() as demo:
        gr.Markdown("# Token Counter")
        gr.Markdown("Upload a CSV file to see token counts per column and total tokens.")
        gr.Markdown("""
                    For OpenAI models Python package `tiktoken` is used.
                    For Anthropic models beta version of [Token counting](https://docs.anthropic.com/en/docs/build-with-claude/token-counting) is used.
                    """)

        with gr.Row():
            csv_upload = gr.File(label="Upload CSV File", type="filepath")

        with gr.Row():
            openai_enabled = gr.Checkbox(label="Calculate tokens for OpenAI models")
            anthropic_enabled = gr.Checkbox(label="Calculate tokens for Anthropic models")

        # Model dropdowns start hidden; each becomes visible only when its
        # provider's checkbox is ticked.
        with gr.Row():
            openai_choice = gr.Dropdown(
                choices=['gpt-4o', 'gpt-4o-mini', 'gpt-4'],
                label="Select OpenAI Model",
                visible=False,
            )
            anthropic_choice = gr.Dropdown(
                choices=['claude-3-5-sonnet-latest', 'claude-3-5-haiku-latest', 'claude-3-opus-latest', 'claude-3-haiku-20240307'],
                label="Select Anthropic Model",
                visible=False,
            )

        def _toggle_visibility(checked):
            # Mirror the checkbox state onto the paired dropdown.
            return gr.update(visible=checked)

        openai_enabled.change(fn=_toggle_visibility, inputs=openai_enabled, outputs=openai_choice)
        anthropic_enabled.change(fn=_toggle_visibility, inputs=anthropic_enabled, outputs=anthropic_choice)

        run_button = gr.Button("Calculate Tokens")
        result_md = gr.Markdown()

        run_button.click(
            fn=process_csv,
            inputs=[csv_upload, openai_enabled, openai_choice, anthropic_enabled, anthropic_choice],
            outputs=result_md,
        )

    demo.launch()

if __name__ == "__main__":
    main()