tomaszki committed on
Commit
e2dbcf5
·
1 Parent(s): 485fe95

First app version

Browse files
Files changed (1) hide show
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from io import StringIO
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM
4
+ from torch.nn import functional as F
5
+ import torch
6
+ import numpy as np
7
+
8
+ import numpyAc
9
+
10
# Use the wide page layout so the two side-by-side columns have room.
st.set_page_config(layout='wide')
11
+
12
@st.cache_resource
def load_model():
    """Load the CodeLlama 7B Python causal language model.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
    model instead of loading it again.

    Returns:
        The model returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    checkpoint = "codellama/CodeLlama-7b-Python-hf"
    # device_map='auto' leaves device placement to the library.
    causal_lm = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='auto')
    return causal_lm
18
+
19
@st.cache_resource
def load_tokenizer():
    """Load the tokenizer that belongs to the CodeLlama 7B Python checkpoint.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
    tokenizer instead of loading it again.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``.
    """
    checkpoint = "codellama/CodeLlama-7b-Python-hf"
    code_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return code_tokenizer
22
+
23
# Module-level tokenizer and model used by encode() and decode() below.
tokenizer = load_tokenizer()
model = load_model()

# Page title and the two side-by-side panels: compress on the left,
# decompress on the right.
st.title('Python file compressor')
columns = st.columns(2, gap='medium')
encode_col, decode_col = columns
28
+
29
@st.cache_data
def encode(text):
    """Compress ``text`` into bytes with the language model and arithmetic coding.

    The text is tokenized and the model is run one token at a time, reusing
    the key/value cache, to obtain a next-token probability distribution at
    every position. Those distributions and the actual next tokens (with an
    EOS id appended) are handed to the arithmetic coder.

    Args:
        text: The source text to compress.

    Returns:
        The byte stream returned by ``numpyAc.arithmeticCoding().encode``.
    """
    eos_token_id = 2  # terminator symbol; decode() stops when it reads this id

    # Follow the model's own device instead of assuming CUDA: the model is
    # loaded with device_map='auto', so it may live on the CPU.
    device = model.device

    codec = numpyAc.arithmeticCoding()
    token_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

    step_logits = []
    past_key_values = None
    # One token per forward pass, mirroring the loop in decode().
    # NOTE(review): presumably this keeps the probabilities numerically
    # identical on both sides; do not batch this without checking decode.
    with torch.no_grad():
        for position in range(token_ids.shape[1]):
            step = model(
                input_ids=token_ids[:, position:position + 1],
                use_cache=True,
                past_key_values=past_key_values,
            )
            past_key_values = step.past_key_values
            step_logits.append(step.logits[0, -1:, :])

    probabilities = F.softmax(torch.cat(step_logits, dim=0), dim=-1)

    # Row i is the distribution for the token that follows position i, so the
    # symbols to encode are the inputs shifted left by one, plus EOS.
    # Index the batch dimension explicitly: squeeze() would collapse a
    # single-token input (e.g. an empty file, presumably just the leading
    # special token) to a 0-d tensor that cannot be sliced.
    eos = torch.tensor([eos_token_id], device=device)
    symbols = torch.cat((token_ids[0, 1:], eos))
    symbols = symbols.type(torch.int16).cpu().numpy()

    byte_stream, _ = codec.encode(probabilities.cpu(), symbols)
    return byte_stream
52
+
53
@st.cache_data
def decode(byte_stream):
    """Reconstruct the token ids stored in an arithmetic-coded byte stream.

    Starting from token id 1, the model is run one token at a time, reusing
    the key/value cache. Each next-token distribution is passed to the
    arithmetic decoder, which yields the next token id. Decoding stops once
    the EOS id (2) is produced.

    Args:
        byte_stream: Bytes previously produced by ``encode``.

    Returns:
        A list of token ids, beginning with the start id and ending with the
        EOS id.
    """
    start_token_id = 1  # presumably the tokenizer's BOS id -- TODO confirm
    eos_token_id = 2    # same terminator that encode() appends

    # Follow the model's own device instead of assuming CUDA: the model is
    # loaded with device_map='auto', so it may live on the CPU.
    device = model.device

    # 32_000 is presumably the vocabulary size -- confirm against numpyAc.
    decodec = numpyAc.arithmeticDeCoding(byte_stream, 32_000)
    input_ids = [start_token_id]
    past_key_values = None

    # NOTE(review): the loop has no upper bound, so a corrupted stream that
    # never yields EOS keeps decoding; consider a maximum length.
    with torch.no_grad():
        while input_ids[-1] != eos_token_id:
            output = model(
                input_ids=torch.tensor([input_ids[-1:]], device=device),
                use_cache=True,
                past_key_values=past_key_values,
            )
            past_key_values = output.past_key_values
            probabilities = F.softmax(output.logits[0, -1:, :], dim=-1).cpu()
            next_token = decodec.decode(probabilities)
            input_ids.append(next_token)
    return input_ids
72
+
73
# Left panel: upload a Python file, compress it, offer the result for download.
with encode_col:
    st.header('Convert your python file to binary.')
    python_file = st.file_uploader("Upload your python file here. I recommend files up to 50-100 lines, so it doesn't take too long.")
    if python_file is not None:
        try:
            # Decode the uploaded bytes directly; a StringIO round-trip adds nothing.
            code = python_file.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            # Show a readable message instead of a traceback for non-text uploads.
            st.error('The uploaded file is not valid UTF-8 text.')
        else:
            bytes_stream = encode(code)
            # Drop only the last extension so "pkg.module.py" becomes
            # "pkg.module.bin" rather than "pkg.bin".
            bin_filename = f'{python_file.name.rsplit(".", 1)[0]}.bin'
            st.download_button('Download binary file', bytes_stream, bin_filename)
82
+
83
# Right panel: upload a compressed file, decode it, then show the recovered
# source and offer it for download.
with decode_col:
    st.header('Convert your binary file to python')
    binary_file = st.file_uploader('Upload your binary file here')
    if binary_file is not None:
        # getvalue() returns the whole upload regardless of the current
        # stream position, matching how the encode panel reads its file.
        tokens = decode(binary_file.getvalue())
        decompressed = tokenizer.decode(tokens, skip_special_tokens=True)
        # Drop only the last extension so "pkg.module.bin" becomes
        # "pkg.module.py" rather than "pkg.py".
        py_filename = f'{binary_file.name.rsplit(".", 1)[0]}.py'
        st.download_button('Download python file', decompressed, py_filename)
        st.code(decompressed)