malvika2003 committed
Commit af25edf · verified · 1 Parent(s): c841e02

Update app.py

Files changed (1):
app.py +279 -339
app.py CHANGED
@@ -1,340 +1,280 @@
- import os
- from transformers import AutoTokenizer, AutoConfig
- from optimum.intel.openvino import OVModelForCausalLM
- from generation_utils import run_generation, estimate_latency, reset_textbox, get_special_token_id
- from config import SUPPORTED_LLM_MODELS
- import gradio as gr
- from threading import Thread
- from time import perf_counter
- from typing import List
- from transformers import AutoTokenizer, TextIteratorStreamer
- import numpy as np
- import os
- from flask import Flask, render_template, redirect, url_for, request, flash
- from flask_sqlalchemy import SQLAlchemy
- from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, current_user
- from werkzeug.security import generate_password_hash, check_password_hash
-
- app = Flask(__name__)
- app.config['SECRET_KEY'] = 'your_secret_key'
- app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
- db = SQLAlchemy(app)
- login_manager = LoginManager()
- login_manager.init_app(app)
- login_manager.login_view = 'login'
-
- class User(db.Model):
-     id = db.Column(db.Integer, primary_key=True)
-     username = db.Column(db.String(80), unique=True, nullable=False)
-     email = db.Column(db.String(120), unique=True, nullable=False)
-
-     def __repr__(self):
-         return '<User %r>' % self.username
-
- # Create the database tables
- with app.app_context():
-     db.create_all()
-
- @login_manager.user_loader
- def load_user(user_id):
-     return User.query.get(int(user_id))
-
- @app.route('/signup', methods=['GET', 'POST'])
- def signup():
-     if request.method == 'POST':
-         username = request.form['username']
-         password = request.form['password']
-         hashed_password = generate_password_hash(password, method='sha256')
-
-         new_user = User(username=username, password=hashed_password)
-         db.session.add(new_user)
-         db.session.commit()
-         flash('Signup successful!', 'success')
-         return redirect(url_for('login'))
-
-     return render_template('signup.html')
-
- @app.route('/login', methods=['GET', 'POST'])
- def login():
-     if request.method == 'POST':
-         username = request.form['username']
-         password = request.form['password']
-         user = User.query.filter_by(username=username).first()
-         if user and check_password_hash(user.password, password):
-             login_user(user)
-             return redirect(url_for('dashboard'))
-         flash('Invalid username or password', 'danger')
-
-     return render_template('login.html')
-
- @app.route('/dashboard')
- @login_required
- def dashboard():
-     return render_template('dashboard.html', name=current_user.username)
-
- @app.route('/logout')
- @login_required
- def logout():
-     logout_user()
-     return redirect(url_for('login'))
-
- if __name__ == '__main__':
-     app.run(debug=True)
- model_dir = "C:/Users/KIIT/OneDrive/Desktop/INTEL/phi-2/INT8_compressed_weights"
- print(f"Checking model directory: {model_dir}")
- print(f"Contents: {os.listdir(model_dir)}")  # Check contents of the directory
-
- print(f"Loading model from {model_dir}")
-
-
- model_name = "susnato/phi-2"
- model_configuration = SUPPORTED_LLM_MODELS["phi-2"]
- ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
-
- tok = AutoTokenizer.from_pretrained(model_name)
-
- ov_model = OVModelForCausalLM.from_pretrained(
-     model_dir,
-     device="CPU",
-     ov_config=ov_config,
- )
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})
- # Continue with your tokenizer usage
- response_key = model_configuration.get("response_key")
- tokenizer_response_key = None
-
- def get_special_token_id(tokenizer: AutoTokenizer, key: str) -> int:
-     """
-     Gets the token ID for a given string that has been added to the tokenizer as a special token.
-
-     Args:
-         tokenizer (PreTrainedTokenizer): the tokenizer
-         key (str): the key to convert to a single token
-
-     Raises:
-         ValueError: if more than one ID was generated
-
-     Returns:
-         int: the token ID for the given key
-     """
-     token_ids = tokenizer.encode(key)
-     if len(token_ids) > 1:
-         raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
-     return token_ids[0]
- if response_key is not None:
-     tokenizer_response_key = next(
-         (token for token in tokenizer.additional_special_tokens if token.startswith(response_key)),
-         None,
-     )
-
- end_key_token_id = None
- if tokenizer_response_key:
-     try:
-         end_key = model_configuration.get("end_key")
-         if end_key:
-             end_key_token_id = get_special_token_id(tokenizer, end_key)
-         # Ensure generation stops once it generates "### End"
-     except ValueError:
-         pass
-
- prompt_template = model_configuration.get("prompt_template", "{instruction}")
- end_key_token_id = end_key_token_id or tokenizer.eos_token_id
- pad_token_id = end_key_token_id or tokenizer.pad_token_id
-
- def estimate_latency(
-     current_time: float,
-     current_perf_text: str,
-     new_gen_text: str,
-     per_token_time: List[float],
-     num_tokens: int,
- ):
-     """
-     Helper function for performance estimation
-
-     Parameters:
-       current_time (float): This step time in seconds.
-       current_perf_text (str): Current content of performance UI field.
-       new_gen_text (str): New generated text.
-       per_token_time (List[float]): history of performance from previous steps.
-       num_tokens (int): Total number of generated tokens.
-
-     Returns:
-       update for performance text field
-       update for a total number of tokens
-     """
-     num_current_toks = len(tokenizer.encode(new_gen_text))
-     num_tokens += num_current_toks
-     per_token_time.append(num_current_toks / current_time)
-     if len(per_token_time) > 10 and len(per_token_time) % 4 == 0:
-         current_bucket = per_token_time[:-10]
-         return (
-             f"Average generation speed: {np.mean(current_bucket):.2f} tokens/s. Total generated tokens: {num_tokens}",
-             num_tokens,
-         )
-     return current_perf_text, num_tokens
- def run_generation(
-     user_text: str,
-     top_p: float,
-     temperature: float,
-     top_k: int,
-     max_new_tokens: int,
-     perf_text: str,
- ):
-     """
-     Text generation function
-
-     Parameters:
-       user_text (str): User-provided instruction for a generation.
-       top_p (float): Nucleus sampling. If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for a generation.
-       temperature (float): The value used to modulate the logits distribution.
-       top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering.
-       max_new_tokens (int): Maximum length of generated sequence.
-       perf_text (str): Content of text field for printing performance results.
-     Returns:
-       model_output (str) - model-generated text
-       perf_text (str) - updated perf text field content
-     """
-
-     # Prepare input prompt according to model expected template
-     prompt_text = prompt_template.format(instruction=user_text)
-
-     # Tokenize the user text.
-     model_inputs = tokenizer(prompt_text, return_tensors="pt", **tokenizer_kwargs)
-
-     # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
-     # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
-     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-     generate_kwargs = dict(
-         model_inputs,
-         streamer=streamer,
-         max_new_tokens=max_new_tokens,
-         do_sample=True,
-         top_p=top_p,
-         temperature=float(temperature),
-         top_k=top_k,
-         eos_token_id=end_key_token_id,
-         pad_token_id=pad_token_id,
-     )
-     t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
-     t.start()
-
-     # Pull the generated text from the streamer, and update the model output.
-     model_output = ""
-     per_token_time = []
-     num_tokens = 0
-     start = perf_counter()
-     for new_text in streamer:
-         current_time = perf_counter() - start
-         model_output += new_text
-         perf_text, num_tokens = estimate_latency(current_time, perf_text, new_text, per_token_time, num_tokens)
-         yield model_output, perf_text
-         start = perf_counter()
-     return model_output, perf_text
- def reset_textbox(instruction: str, response: str, perf: str):
-     """
-     Helper function for resetting content of all text fields
-
-     Parameters:
-       instruction (str): Content of user instruction field.
-       response (str): Content of model response field.
-       perf (str): Content of performance info field
-
-     Returns:
-       empty string for each placeholder
-     """
-     return "", "", ""
-
-
-
- examples = [
-     "Give me a recipe for pizza with pineapple",
-     "Write me a tweet about the new OpenVINO release",
-     "Explain the difference between CPU and GPU",
-     "Give five ideas for a great weekend with family",
-     "Do Androids dream of Electric sheep?",
-     "Who is Dolly?",
-     "Please give me advice on how to write resume?",
-     "Name 3 advantages to being a cat",
-     "Write instructions on how to become a good AI engineer",
-     "Write a love letter to my best friend",
- ]
-
- def main():
-     with gr.Blocks() as demo:
-         gr.Markdown(
-             "# Question Answering with Model and OpenVINO.\n"
-             "Provide instruction which describes a task below or select among predefined examples and model writes response that performs requested task."
-         )
-
-         with gr.Row():
-             with gr.Column(scale=4):
-                 user_text = gr.Textbox(
-                     placeholder="Write an email about an alpaca that likes flan",
-                     label="User instruction",
-                 )
-                 model_output = gr.Textbox(label="Model response", interactive=False)
-                 performance = gr.Textbox(label="Performance", lines=1, interactive=False)
-             with gr.Column(scale=1):
-                 button_clear = gr.Button(value="Clear")
-                 button_submit = gr.Button(value="Submit")
-                 gr.Examples(examples, user_text)
-             with gr.Column(scale=1):
-                 max_new_tokens = gr.Slider(
-                     minimum=1,
-                     maximum=1000,
-                     value=256,
-                     step=1,
-                     interactive=True,
-                     label="Max New Tokens",
-                 )
-                 top_p = gr.Slider(
-                     minimum=0.05,
-                     maximum=1.0,
-                     value=0.92,
-                     step=0.05,
-                     interactive=True,
-                     label="Top-p (nucleus sampling)",
-                 )
-                 top_k = gr.Slider(
-                     minimum=0,
-                     maximum=50,
-                     value=0,
-                     step=1,
-                     interactive=True,
-                     label="Top-k",
-                 )
-                 temperature = gr.Slider(
-                     minimum=0.1,
-                     maximum=5.0,
-                     value=0.8,
-                     step=0.1,
-                     interactive=True,
-                     label="Temperature",
-                 )
-
-         user_text.submit(
-             run_generation,
-             [user_text, top_p, temperature, top_k, max_new_tokens, performance],
-             [model_output, performance],
-         )
-         button_submit.click(
-             run_generation,
-             [user_text, top_p, temperature, top_k, max_new_tokens, performance],
-             [model_output, performance],
-         )
-         button_clear.click(
-             reset_textbox,
-             [user_text, model_output, performance],
-             [user_text, model_output, performance],
-         )
-
-     if __name__ == "__main__":
-         demo.queue()
-         try:
-             demo.launch(height=800)
-         except Exception:
-             demo.launch(share=True, height=800)
-
- # Call main function to start Gradio interface
+ import os
+ from transformers import AutoTokenizer, AutoConfig
+ from optimum.intel.openvino import OVModelForCausalLM
+ from generation_utils import run_generation, estimate_latency, reset_textbox, get_special_token_id
+ from config import SUPPORTED_LLM_MODELS
+ import gradio as gr
+ from threading import Thread
+ from time import perf_counter
+ from typing import List
+ from transformers import AutoTokenizer, TextIteratorStreamer
+ import numpy as np
+ import os
+ from flask import Flask, render_template, redirect, url_for, request, flash
+ from flask_sqlalchemy import SQLAlchemy
+ from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, current_user
+ from werkzeug.security import generate_password_hash, check_password_hash
+
+ app = Flask(__name__)
+
+
+ if __name__ == '__main__':
+     app.run(debug=True)
+ model_dir = "C:/phi-2/INT8_compressed_weights"
+ print(f"Checking model directory: {model_dir}")
+ print(f"Contents: {os.listdir(model_dir)}")  # Check contents of the directory
+
+ print(f"Loading model from {model_dir}")
+
+
+ model_name = "susnato/phi-2"
+ model_configuration = SUPPORTED_LLM_MODELS["phi-2"]
+ ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
+
+ tok = AutoTokenizer.from_pretrained(model_name)
+
+ ov_model = OVModelForCausalLM.from_pretrained(
+     model_dir,
+     device="CPU",
+     ov_config=ov_config,
+ )
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", {})
+ # Continue with your tokenizer usage
+ response_key = model_configuration.get("response_key")
+ tokenizer_response_key = None
+
+ def get_special_token_id(tokenizer: AutoTokenizer, key: str) -> int:
+     """
+     Gets the token ID for a given string that has been added to the tokenizer as a special token.
+
+     Args:
+         tokenizer (PreTrainedTokenizer): the tokenizer
+         key (str): the key to convert to a single token
+
+     Raises:
+         ValueError: if more than one ID was generated
+
+     Returns:
+         int: the token ID for the given key
+     """
+     token_ids = tokenizer.encode(key)
+     if len(token_ids) > 1:
+         raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
+     return token_ids[0]
+ if response_key is not None:
+     tokenizer_response_key = next(
+         (token for token in tokenizer.additional_special_tokens if token.startswith(response_key)),
+         None,
+     )
+
+ end_key_token_id = None
+ if tokenizer_response_key:
+     try:
+         end_key = model_configuration.get("end_key")
+         if end_key:
+             end_key_token_id = get_special_token_id(tokenizer, end_key)
+         # Ensure generation stops once it generates "### End"
+     except ValueError:
+         pass
+
+ prompt_template = model_configuration.get("prompt_template", "{instruction}")
+ end_key_token_id = end_key_token_id or tokenizer.eos_token_id
+ pad_token_id = end_key_token_id or tokenizer.pad_token_id
+
+ def estimate_latency(
+     current_time: float,
+     current_perf_text: str,
+     new_gen_text: str,
+     per_token_time: List[float],
+     num_tokens: int,
+ ):
+     """
+     Helper function for performance estimation
+
+     Parameters:
+       current_time (float): This step time in seconds.
+       current_perf_text (str): Current content of performance UI field.
+       new_gen_text (str): New generated text.
+       per_token_time (List[float]): history of performance from previous steps.
+       num_tokens (int): Total number of generated tokens.
+
+     Returns:
+       update for performance text field
+       update for a total number of tokens
+     """
+     num_current_toks = len(tokenizer.encode(new_gen_text))
+     num_tokens += num_current_toks
+     per_token_time.append(num_current_toks / current_time)
+     if len(per_token_time) > 10 and len(per_token_time) % 4 == 0:
+         current_bucket = per_token_time[:-10]
+         return (
+             f"Average generation speed: {np.mean(current_bucket):.2f} tokens/s. Total generated tokens: {num_tokens}",
+             num_tokens,
+         )
+     return current_perf_text, num_tokens
+ def run_generation(
+     user_text: str,
+     top_p: float,
+     temperature: float,
+     top_k: int,
+     max_new_tokens: int,
+     perf_text: str,
+ ):
+     """
+     Text generation function
+
+     Parameters:
+       user_text (str): User-provided instruction for a generation.
+       top_p (float): Nucleus sampling. If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for a generation.
+       temperature (float): The value used to modulate the logits distribution.
+       top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering.
+       max_new_tokens (int): Maximum length of generated sequence.
+       perf_text (str): Content of text field for printing performance results.
+     Returns:
+       model_output (str) - model-generated text
+       perf_text (str) - updated perf text field content
+     """
+
+     # Prepare input prompt according to model expected template
+     prompt_text = prompt_template.format(instruction=user_text)
+
+     # Tokenize the user text.
+     model_inputs = tokenizer(prompt_text, return_tensors="pt", **tokenizer_kwargs)
+
+     # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
+     # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         top_p=top_p,
+         temperature=float(temperature),
+         top_k=top_k,
+         eos_token_id=end_key_token_id,
+         pad_token_id=pad_token_id,
+     )
+     t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     # Pull the generated text from the streamer, and update the model output.
+     model_output = ""
+     per_token_time = []
+     num_tokens = 0
+     start = perf_counter()
+     for new_text in streamer:
+         current_time = perf_counter() - start
+         model_output += new_text
+         perf_text, num_tokens = estimate_latency(current_time, perf_text, new_text, per_token_time, num_tokens)
+         yield model_output, perf_text
+         start = perf_counter()
+     return model_output, perf_text
+ def reset_textbox(instruction: str, response: str, perf: str):
+     """
+     Helper function for resetting content of all text fields
+
+     Parameters:
+       instruction (str): Content of user instruction field.
+       response (str): Content of model response field.
+       perf (str): Content of performance info field
+
+     Returns:
+       empty string for each placeholder
+     """
+     return "", "", ""
+
+
+
+ examples = [
+     "Give me a recipe for pizza with pineapple",
+     "Write me a tweet about the new OpenVINO release",
+     "Explain the difference between CPU and GPU",
+     "Give five ideas for a great weekend with family",
+     "Do Androids dream of Electric sheep?",
+     "Who is Dolly?",
+     "Please give me advice on how to write resume?",
+     "Name 3 advantages to being a cat",
+     "Write instructions on how to become a good AI engineer",
+     "Write a love letter to my best friend",
+ ]
+
+ def main():
+     with gr.Blocks() as demo:
+         gr.Markdown(
+             "# Question Answering with Model and OpenVINO.\n"
+             "Provide instruction which describes a task below or select among predefined examples and model writes response that performs requested task."
+         )
+
+         with gr.Row():
+             with gr.Column(scale=4):
+                 user_text = gr.Textbox(
+                     placeholder="Write an email about an alpaca that likes flan",
+                     label="User instruction",
+                 )
+                 model_output = gr.Textbox(label="Model response", interactive=False)
+                 performance = gr.Textbox(label="Performance", lines=1, interactive=False)
+             with gr.Column(scale=1):
+                 button_clear = gr.Button(value="Clear")
+                 button_submit = gr.Button(value="Submit")
+                 gr.Examples(examples, user_text)
+             with gr.Column(scale=1):
+                 max_new_tokens = gr.Slider(
+                     minimum=1,
+                     maximum=1000,
+                     value=256,
+                     step=1,
+                     interactive=True,
+                     label="Max New Tokens",
+                 )
+                 top_p = gr.Slider(
+                     minimum=0.05,
+                     maximum=1.0,
+                     value=0.92,
+                     step=0.05,
+                     interactive=True,
+                     label="Top-p (nucleus sampling)",
+                 )
+                 top_k = gr.Slider(
+                     minimum=0,
+                     maximum=50,
+                     value=0,
+                     step=1,
+                     interactive=True,
+                     label="Top-k",
+                 )
+                 temperature = gr.Slider(
+                     minimum=0.1,
+                     maximum=5.0,
+                     value=0.8,
+                     step=0.1,
+                     interactive=True,
+                     label="Temperature",
+                 )
+
+         user_text.submit(
+             run_generation,
+             [user_text, top_p, temperature, top_k, max_new_tokens, performance],
+             [model_output, performance],
+         )
+         button_submit.click(
+             run_generation,
+             [user_text, top_p, temperature, top_k, max_new_tokens, performance],
+             [model_output, performance],
+         )
+         button_clear.click(
+             reset_textbox,
+             [user_text, model_output, performance],
+             [user_text, model_output, performance],
+         )
+
+     if __name__ == "__main__":
+         demo.queue()
+         try:
+             demo.launch(height=800)
+         except Exception:
+             demo.launch(share=True, height=800)
+
+ # Call main function to start Gradio interface
  main()
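
A note on the core pattern in run_generation above: model.generate() blocks until decoding finishes, so the app runs it on a worker thread while the main thread drains a TextIteratorStreamer to update the UI as tokens arrive. A minimal self-contained sketch of that same pattern, using a stock transformers model ("gpt2" is only a stand-in here, not this Space's phi-2 weights):

    from threading import Thread
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    inputs = tokenizer("OpenVINO is", return_tensors="pt")
    # skip_prompt drops the echoed prompt; timeout raises if the worker dies silently.
    streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)

    # generate() runs on the worker; the streamer is the channel back to this thread.
    worker = Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=20))
    worker.start()
    for chunk in streamer:
        print(chunk, end="", flush=True)  # in app.py this step becomes a yield to Gradio
    worker.join()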
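
For the OpenVINO-specific pieces: ov_config passes device plugin properties through optimum-intel (a latency hint, a single inference stream, no model cache), and OVModelForCausalLM.from_pretrained() loads a directory that already contains an exported IR plus tokenizer files. A sketch under the assumption of an illustrative local path ov_phi2/ (not this Space's actual layout):

    from optimum.intel.openvino import OVModelForCausalLM
    from transformers import AutoTokenizer

    model_dir = "ov_phi2"  # hypothetical directory holding an exported OpenVINO model
    ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = OVModelForCausalLM.from_pretrained(model_dir, device="CPU", ov_config=ov_config)

    inputs = tokenizer("What is OpenVINO?", return_tensors="pt")
    output_ids = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))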
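
Finally, get_special_token_id only accepts strings the tokenizer maps to exactly one ID; the "### End" marker below is illustrative, since the real end_key comes from SUPPORTED_LLM_MODELS in config.py:

    # Hypothetical: register the marker as a special token, then resolve it to an ID.
    tokenizer.add_special_tokens({"additional_special_tokens": ["### End"]})
    end_id = get_special_token_id(tokenizer, "### End")  # one ID, else ValueError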