awacke1 committed on
Commit
607046c
·
1 Parent(s): 3dd5173

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -71
app.py CHANGED
@@ -1,10 +1,7 @@
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
3
 
4
- client = InferenceClient(
5
- "mistralai/Mistral-7B-Instruct-v0.1"
6
- )
7
-
8
 
9
  def format_prompt(message, history):
10
  prompt = "<s>"
@@ -14,14 +11,11 @@ def format_prompt(message, history):
14
  prompt += f"[INST] {message} [/INST]"
15
  return prompt
16
 
17
- def generate(
18
- prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
19
- ):
20
  temperature = float(temperature)
21
  if temperature < 1e-2:
22
  temperature = 1e-2
23
  top_p = float(top_p)
24
-
25
  generate_kwargs = dict(
26
  temperature=temperature,
27
  max_new_tokens=max_new_tokens,
@@ -30,67 +24,23 @@ def generate(
30
  do_sample=True,
31
  seed=42,
32
  )
33
-
34
  formatted_prompt = format_prompt(prompt, history)
35
-
36
  stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
37
  output = ""
38
-
39
  for response in stream:
40
  output += response.token.text
41
  yield output
42
  return output
43
 
44
-
45
  additional_inputs=[
46
- gr.Slider(
47
- label="Temperature",
48
- value=0.9,
49
- minimum=0.0,
50
- maximum=1.0,
51
- step=0.05,
52
- interactive=True,
53
- info="Higher values produce more diverse outputs",
54
- ),
55
- gr.Slider(
56
- label="Max new tokens",
57
- value=256,
58
- minimum=0,
59
- maximum=1048,
60
- step=64,
61
- interactive=True,
62
- info="The maximum numbers of new tokens",
63
- ),
64
- gr.Slider(
65
- label="Top-p (nucleus sampling)",
66
- value=0.90,
67
- minimum=0.0,
68
- maximum=1,
69
- step=0.05,
70
- interactive=True,
71
- info="Higher values sample more low-probability tokens",
72
- ),
73
- gr.Slider(
74
- label="Repetition penalty",
75
- value=1.2,
76
- minimum=1.0,
77
- maximum=2.0,
78
- step=0.05,
79
- interactive=True,
80
- info="Penalize repeated tokens",
81
- )
82
  ]
83
 
84
- css = """
85
- #mkd {
86
- height: 200px;
87
- overflow: auto;
88
- border: 1px solid #ccc;
89
- }
90
- """
91
-
92
  with gr.Blocks(css=css) as demo:
93
-
94
  gr.ChatInterface(
95
  generate,
96
  additional_inputs=additional_inputs,
@@ -151,35 +101,25 @@ with gr.Blocks(css=css) as demo:
151
  | 📈 Expected speedups with Flash Attention 2 | Upcoming update expected to bring speed improvements. | Keep an eye out for this update to benefit from performance gains. |
152
 
153
  # 🛠 Model Features and More 🛠
154
-
155
  ## Features
156
-
157
  - 🪟 Sliding Window Attention with 128K tokens span
158
  - **Byline**: Increases model's understanding of context, resulting in more coherent and contextually relevant outputs.
159
-
160
  - 🚀 GQA for faster inference
161
  - **Byline**: Speeds up the model inference time without sacrificing too much on accuracy.
162
-
163
  - ๐Ÿ“ Byte-fallback BPE tokenizer
164
  - **Byline**: Allows the tokenizer to handle a wider variety of input text while keeping token size manageable.
165
-
166
  - 📜 License: Released under Apache 2.0 License
167
  - **Byline**: Gives you a permissive free software license, allowing you freedom to use, modify, and distribute the code.
168
-
169
  ## Usage 📦
170
-
171
  - 📚 Available on Huggingface Hub
172
  - **Byline**: Makes it easier to integrate the model into various projects.
173
-
174
  - ๐Ÿ Python code snippets for easy setup
175
  - **Byline**: Facilitates rapid development and deployment, especially useful for prototyping.
176
-
177
  - 📈 Expected speedups with Flash Attention 2
178
  - **Byline**: Keep an eye out for this update to benefit from performance gains.
179
  """
180
  gr.Markdown(markdown)
181
-
182
-
183
  def SpeechSynthesis(result):
184
  documentHTML5='''
185
  <!DOCTYPE html>
@@ -207,9 +147,7 @@ with gr.Blocks(css=css) as demo:
207
  </html>
208
  '''
209
  gr.HTML(documentHTML5)
210
- # components.html(documentHTML5, width=1280, height=1024)
211
- #return result
212
  SpeechSynthesis(markdown)
213
 
214
-
215
  demo.queue().launch(debug=True)
 
1
  from huggingface_hub import InferenceClient
2
  import gradio as gr
3
 
4
+ client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
 
 
 
5
 
6
  def format_prompt(message, history):
7
  prompt = "<s>"
 
11
  prompt += f"[INST] {message} [/INST]"
12
  return prompt
13
 
14
+ def generate(prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,):
 
 
15
  temperature = float(temperature)
16
  if temperature < 1e-2:
17
  temperature = 1e-2
18
  top_p = float(top_p)
 
19
  generate_kwargs = dict(
20
  temperature=temperature,
21
  max_new_tokens=max_new_tokens,
 
24
  do_sample=True,
25
  seed=42,
26
  )
 
27
  formatted_prompt = format_prompt(prompt, history)
 
28
  stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
29
  output = ""
 
30
  for response in stream:
31
  output += response.token.text
32
  yield output
33
  return output
34
 
 
35
  additional_inputs=[
36
+ gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs",),
37
+ gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=1048, step=64, interactive=True, info="The maximum numbers of new tokens",),
38
+ gr.Slider(label="Top-p (nucleus sampling)",value=0.90,minimum=0.0,maximum=1,step=0.05,interactive=True,info="Higher values sample more low-probability tokens",),
39
+ gr.Slider(label="Repetition penalty",value=1.2,minimum=1.0,maximum=2.0,step=0.05,interactive=True,info="Penalize repeated tokens",)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  ]
41
 
42
+ css = """#mkd {height: 200px; overflow: auto; border: 1px solid #ccc;}"""
 
 
 
 
 
 
 
43
  with gr.Blocks(css=css) as demo:
 
44
  gr.ChatInterface(
45
  generate,
46
  additional_inputs=additional_inputs,
 
101
  | 📈 Expected speedups with Flash Attention 2 | Upcoming update expected to bring speed improvements. | Keep an eye out for this update to benefit from performance gains. |
102
 
103
  # 🛠 Model Features and More 🛠
 
104
  ## Features
 
105
  - 🪟 Sliding Window Attention with 128K tokens span
106
  - **Byline**: Increases model's understanding of context, resulting in more coherent and contextually relevant outputs.
 
107
  - 🚀 GQA for faster inference
108
  - **Byline**: Speeds up the model inference time without sacrificing too much on accuracy.
 
109
  - ๐Ÿ“ Byte-fallback BPE tokenizer
110
  - **Byline**: Allows the tokenizer to handle a wider variety of input text while keeping token size manageable.
 
111
  - 📜 License: Released under Apache 2.0 License
112
  - **Byline**: Gives you a permissive free software license, allowing you freedom to use, modify, and distribute the code.
 
113
  ## Usage 📦
 
114
  - 📚 Available on Huggingface Hub
115
  - **Byline**: Makes it easier to integrate the model into various projects.
 
116
  - ๐Ÿ Python code snippets for easy setup
117
  - **Byline**: Facilitates rapid development and deployment, especially useful for prototyping.
 
118
  - 📈 Expected speedups with Flash Attention 2
119
  - **Byline**: Keep an eye out for this update to benefit from performance gains.
120
  """
121
  gr.Markdown(markdown)
122
+
 
123
  def SpeechSynthesis(result):
124
  documentHTML5='''
125
  <!DOCTYPE html>
 
147
  </html>
148
  '''
149
  gr.HTML(documentHTML5)
150
+
 
151
  SpeechSynthesis(markdown)
152
 
 
153
  demo.queue().launch(debug=True)