malvika2003 committed
Commit 0d5912e · verified · 1 Parent(s): af25edf

Update app.py

Remove leftover Flask code, duplicate imports, and function docstrings; point model_dir at the local INT8_compressed_weights export; and move the Gradio launch under the __main__ guard.

Files changed (1)
  1. app.py +18 -96
app.py CHANGED
@@ -1,67 +1,38 @@
  import os
- from transformers import AutoTokenizer, AutoConfig
+ from transformers import AutoTokenizer
  from optimum.intel.openvino import OVModelForCausalLM
- from generation_utils import run_generation, estimate_latency, reset_textbox,get_special_token_id
+ from generation_utils import run_generation, estimate_latency, reset_textbox, get_special_token_id
  from config import SUPPORTED_LLM_MODELS
  import gradio as gr
  from threading import Thread
  from time import perf_counter
  from typing import List
- from transformers import AutoTokenizer, TextIteratorStreamer
+ from transformers import TextIteratorStreamer
  import numpy as np
- import os
- from flask import Flask, render_template, redirect, url_for, request, flash
- from flask_sqlalchemy import SQLAlchemy
- from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, current_user
- from werkzeug.security import generate_password_hash, check_password_hash
-
- app = Flask(__name__)
-
-
- if __name__ == '__main__':
-     app.run(debug=True)
- model_dir = "C:/phi-2/INT8_compressed_weights"
- print(f"Checking model directory: {model_dir}")
- print(f"Contents: {os.listdir(model_dir)}")  # Check contents of the directory
-
- print(f"Loading model from {model_dir}")
-
 
+ # Model configuration and loading
+ model_dir = "C:/Users/KIIT/OneDrive/Desktop/INTEL/phi-2/INT8_compressed_weights"
  model_name = "susnato/phi-2"
  model_configuration = SUPPORTED_LLM_MODELS["phi-2"]
  ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}
 
- tok = AutoTokenizer.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
 
  ov_model = OVModelForCausalLM.from_pretrained(
      model_dir,
      device="CPU",
      ov_config=ov_config,
  )
- tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokenizer_kwargs = model_configuration.get("toeknizer_kwargs", {})
- # Continue with your tokenizer usage
  response_key = model_configuration.get("response_key")
  tokenizer_response_key = None
 
  def get_special_token_id(tokenizer: AutoTokenizer, key: str) -> int:
-     """
-     Gets the token ID for a given string that has been added to the tokenizer as a special token.
-
-     Args:
-         tokenizer (PreTrainedTokenizer): the tokenizer
-         key (str): the key to convert to a single token
-
-     Raises:
-         ValueError: if more than one ID was generated
-
-     Returns:
-         int: the token ID for the given key
-     """
      token_ids = tokenizer.encode(key)
      if len(token_ids) > 1:
          raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
      return token_ids[0]
+
  if response_key is not None:
      tokenizer_response_key = next(
          (token for token in tokenizer.additional_special_tokens if token.startswith(response_key)),
@@ -73,8 +44,7 @@ if tokenizer_response_key:
      try:
          end_key = model_configuration.get("end_key")
          if end_key:
-             end_key_token_id =get_special_token_id(tokenizer, end_key)
-             # Ensure generation stops once it generates "### End"
+             end_key_token_id = get_special_token_id(tokenizer, end_key)
      except ValueError:
          pass
 
@@ -89,20 +59,6 @@ def estimate_latency(
      per_token_time: List[float],
      num_tokens: int,
  ):
-     """
-     Helper function for performance estimation
-
-     Parameters:
-         current_time (float): This step time in seconds.
-         current_perf_text (str): Current content of performance UI field.
-         new_gen_text (str): New generated text.
-         per_token_time (List[float]): history of performance from previous steps.
-         num_tokens (int): Total number of generated tokens.
-
-     Returns:
-         update for performance text field
-         update for a total number of tokens
-     """
      num_current_toks = len(tokenizer.encode(new_gen_text))
      num_tokens += num_current_toks
      per_token_time.append(num_current_toks / current_time)
@@ -113,6 +69,7 @@ def estimate_latency(
          num_tokens,
      )
      return current_perf_text, num_tokens
+
  def run_generation(
      user_text: str,
      top_p: float,
@@ -121,29 +78,8 @@ def run_generation(
      max_new_tokens: int,
      perf_text: str,
  ):
-     """
-     Text generation function
-
-     Parameters:
-         user_text (str): User-provided instruction for a generation.
-         top_p (float): Nucleus sampling. If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for a generation.
-         temperature (float): The value used to module the logits distribution.
-         top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering.
-         max_new_tokens (int): Maximum length of generated sequence.
-         perf_text (str): Content of text field for printing performance results.
-     Returns:
-         model_output (str) - model-generated text
-         perf_text (str) - updated perf text filed content
-     """
-
-     # Prepare input prompt according to model expected template
      prompt_text = prompt_template.format(instruction=user_text)
-
-     # Tokenize the user text.
      model_inputs = tokenizer(prompt_text, return_tensors="pt", **tokenizer_kwargs)
-
-     # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
-     # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
      generate_kwargs = dict(
          model_inputs,
@@ -158,8 +94,6 @@ def run_generation(
      )
      t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
      t.start()
-
-     # Pull the generated text from the streamer, and update the model output.
      model_output = ""
      per_token_time = []
      num_tokens = 0
@@ -171,22 +105,10 @@
          yield model_output, perf_text
          start = perf_counter()
      return model_output, perf_text
- def reset_textbox(instruction: str, response: str, perf: str):
-     """
-     Helper function for resetting content of all text fields
 
-     Parameters:
-         instruction (str): Content of user instruction field.
-         response (str): Content of model response field.
-         perf (str): Content of performance info filed
-
-     Returns:
-         empty string for each placeholder
-     """
+ def reset_textbox(instruction: str, response: str, perf: str):
      return "", "", ""
 
-
-
  examples = [
      "Give me a recipe for pizza with pineapple",
      "Write me a tweet about the new OpenVINO release",
@@ -269,12 +191,12 @@ def main():
          [user_text, model_output, performance],
      )
 
- if __name__ == "__main__":
-     demo.queue()
-     try:
-         demo.launch(height=800)
-     except Exception:
-         demo.launch(share=True, height=800)
+     demo.queue()
+     try:
+         demo.launch(height=800)
+     except Exception:
+         demo.launch(share=True, height=800)
+
+ if __name__ == "__main__":
+     main()
 
- # Call main function to start Gradio interface
- main()
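
The updated app.py streams tokens from a background generation thread into the Gradio UI. As a reference for that pattern, below is a minimal, self-contained sketch using the same calls the diff shows (AutoTokenizer, OVModelForCausalLM.from_pretrained, TextIteratorStreamer, and a Thread running generate). The model_dir value here is a placeholder for a local OpenVINO export, and the prompt is taken from the examples list above; this is an illustrative sketch, not the commit's exact code.

import os
from threading import Thread

from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, TextIteratorStreamer

# Same tokenizer source the diff uses; model_dir is a placeholder for a
# local OpenVINO export (e.g. an INT8_compressed_weights folder).
model_name = "susnato/phi-2"
model_dir = os.environ.get("OV_MODEL_DIR", "INT8_compressed_weights")

tokenizer = AutoTokenizer.from_pretrained(model_name)
ov_model = OVModelForCausalLM.from_pretrained(model_dir, device="CPU")

prompt = "Write me a tweet about the new OpenVINO release"
model_inputs = tokenizer(prompt, return_tensors="pt")

# Generation runs on a background thread; the streamer yields text chunks
# in the main thread as they are produced, which is what lets the Gradio
# callback in app.py yield partial output to the UI.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=64)

thread = Thread(target=ov_model.generate, kwargs=generate_kwargs)
thread.start()

output = ""
for new_text in streamer:
    output += new_text
    print(new_text, end="", flush=True)
thread.join()

In app.py itself, the loop body additionally times each chunk with perf_counter() and calls estimate_latency() to refresh the performance textbox, yielding (model_output, perf_text) so Gradio can update the response and performance fields incrementally.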