semantic-entropy-probes

Sleeping

App Files Community

s-a-malik committed on Jul 17, 2024

Commit

0120475

1 Parent(s): b501b77

thread

Browse files

Files changed (1) hide show

app.py +71 -102

app.py CHANGED Viewed

@@ -26,10 +26,10 @@ DESCRIPTION = """
 """
 EXAMPLES = [
-    ["What is the capital of France?", "You are a helpful assistant."],
-    ["Who landed on the moon?", "You are a knowledgeable historian."],
-    ["Who is Yarin Gal?", "You are a helpful assistant."],
-    ["Explain the theory of relativity in simple terms.", "You are an expert physicist explaining concepts to a layman."],
 ]
 if torch.cuda.is_available():
@@ -93,28 +93,7 @@ class CustomStreamer(TextIteratorStreamer):
-#     se_highlighted_text = ""
-#     acc_highlighted_text = ""
-#     for new_text in streamer:
-#         hidden_states = streamer.hidden_states_queue.get()
-#         # Semantic Uncertainty Probe
-#         se_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
-#         se_concat_layers = se_token_embeddings.numpy()[se_layer_range[0]:se_layer_range[1]].reshape(-1)
-#         se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
-#         # Accuracy Probe
-#         acc_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
-#         acc_concat_layers = acc_token_embeddings.numpy()[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
-#         acc_probe_pred = acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1] * 2 - 1
-#         se_new_highlighted_text = highlight_text(new_text, se_probe_pred)
-#         acc_new_highlighted_text = highlight_text(new_text, acc_probe_pred)
-#         se_highlighted_text += se_new_highlighted_text
-#         acc_highlighted_text += acc_new_highlighted_text
-#         yield se_highlighted_text, acc_highlighted_text
 @spaces.GPU
 def generate(
@@ -137,7 +116,8 @@ def generate(
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=max_new_tokens,
@@ -150,41 +130,84 @@ def generate(
         output_hidden_states=True,
         return_dict_in_generate=True,
     )
-    # Generate without threading
-    with torch.no_grad():
-        outputs = model.generate(**generation_kwargs)
-    generated_tokens = outputs.sequences[0, input_ids.shape[1]:]
-    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-    # hidden states
-    hidden = outputs.hidden_states  # list of tensors, one for each token, then (batch size, sequence length, hidden size)
-    # TODO do this loop on the fly instead of waiting for the whole generation
     se_highlighted_text = ""
     acc_highlighted_text = ""
-    for i in range(1, len(hidden)):
         # Semantic Uncertainty Probe
-        token_embeddings = torch.stack([generated_token[0, 0, :].cpu() for generated_token in hidden[i]]).numpy()   # (num_layers, hidden_size)
         se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)
         se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
         # Accuracy Probe
-        # acc_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
         acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
         acc_probe_pred = (1 - acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1]) * 2 - 1
-        output_id = outputs.sequences[0, input_ids.shape[1]+i]
-        output_word = tokenizer.decode(output_id)
-        print(output_id, output_word, se_probe_pred, acc_probe_pred)
-        se_new_highlighted_text = highlight_text(output_word, se_probe_pred)
-        acc_new_highlighted_text = highlight_text(output_word, acc_probe_pred)
         se_highlighted_text += f" {se_new_highlighted_text}"
         acc_highlighted_text += f" {acc_new_highlighted_text}"
         # yield se_highlighted_text, acc_highlighted_text
-    return se_highlighted_text, acc_highlighted_text
@@ -215,7 +238,7 @@ with gr.Blocks(title="Llama-2 7B Chat with Dual Probes", css="footer {visibility
         with gr.Column():
             max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
             top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
             top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
             repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
@@ -243,7 +266,6 @@ with gr.Blocks(title="Llama-2 7B Chat with Dual Probes", css="footer {visibility
         inputs=[message, system_prompt],
         outputs=[se_output, acc_output],
         fn=generate,
     )
     generate_btn.click(
@@ -252,59 +274,6 @@ with gr.Blocks(title="Llama-2 7B Chat with Dual Probes", css="footer {visibility
         outputs=[se_output, acc_output]
     )
-# chat_interface = gr.ChatInterface(
-#     fn=generate,
-#     additional_inputs=[
-#         gr.Textbox(label="System prompt", lines=6),
-#         gr.Slider(
-#             label="Max new tokens",
-#             minimum=1,
-#             maximum=MAX_MAX_NEW_TOKENS,
-#             step=1,
-#             value=DEFAULT_MAX_NEW_TOKENS,
-#         ),
-#         gr.Slider(
-#             label="Temperature",
-#             minimum=0.1,
-#             maximum=4.0,
-#             step=0.1,
-#             value=0.6,
-#         ),
-#         gr.Slider(
-#             label="Top-p (nucleus sampling)",
-#             minimum=0.05,
-#             maximum=1.0,
-#             step=0.05,
-#             value=0.9,
-#         ),
-#         gr.Slider(
-#             label="Top-k",
-#             minimum=1,
-#             maximum=1000,
-#             step=1,
-#             value=50,
-#         ),
-#         gr.Slider(
-#             label="Repetition penalty",
-#             minimum=1.0,
-#             maximum=2.0,
-#             step=0.05,
-#             value=1.2,
-#         ),
-#     ],
-#     stop_btn=None,
-#     examples=[
-#         ["What is the capital of France?"],
-#         ["Who landed on the moon?"],
-#         ["Who is Yarin Gal?"]
-#     ],
-#     title="Llama-2 7B Chat with Streamable Semantic Uncertainty Probe",
-#     description=DESCRIPTION,
-# )
-# if __name__ == "__main__":
-#     chat_interface.launch()
 if __name__ == "__main__":
     demo.launch()

 """
 EXAMPLES = [
+    ["What is the capital of France?", ""],
+    ["Who landed on the moon?", ""],
+    ["Who is Yarin Gal?", ""],
+    ["Explain the theory of relativity in simple terms.", ""],
 ]
 if torch.cuda.is_available():
 @spaces.GPU
 def generate(
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
+    # streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    streamer = CustomStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens=max_new_tokens,
         output_hidden_states=True,
         return_dict_in_generate=True,
     )
+    # with threading
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
     se_highlighted_text = ""
     acc_highlighted_text = ""
+    for new_text in streamer:
+        hidden_states = streamer.hidden_states_queue.get()
         # Semantic Uncertainty Probe
+        token_embeddings = torch.stack([generated_token[0, 0, :].cpu() for generated_token in hidden_states]).numpy()   # (num_layers, hidden_size)
         se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)
         se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
         # Accuracy Probe
         acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
         acc_probe_pred = (1 - acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1]) * 2 - 1
+        print(new_text, se_probe_pred, acc_probe_pred)
+        se_new_highlighted_text = highlight_text(new_text, se_probe_pred)
+        acc_new_highlighted_text = highlight_text(new_text, acc_probe_pred)
         se_highlighted_text += f" {se_new_highlighted_text}"
         acc_highlighted_text += f" {acc_new_highlighted_text}"
+        yield se_highlighted_text, acc_highlighted_text
+        # Semantic Uncertainty Probe
+        # se_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
+        # se_concat_layers = se_token_embeddings.numpy()[se_layer_range[0]:se_layer_range[1]].reshape(-1)
+        # se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
+        # # Accuracy Probe
+        # acc_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
+        # acc_concat_layers = acc_token_embeddings.numpy()[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
+        # acc_probe_pred = acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1] * 2 - 1
+        # se_new_highlighted_text = highlight_text(new_text, se_probe_pred)
+        # acc_new_highlighted_text = highlight_text(new_text, acc_probe_pred)
+        # se_highlighted_text += se_new_highlighted_text
+        # acc_highlighted_text += acc_new_highlighted_text
         # yield se_highlighted_text, acc_highlighted_text
+    # Generate without threading
+    # with torch.no_grad():
+    #     outputs = model.generate(**generation_kwargs)
+    # generated_tokens = outputs.sequences[0, input_ids.shape[1]:]
+    # generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+    # # hidden states
+    # hidden = outputs.hidden_states  # list of tensors, one for each token, then (batch size, sequence length, hidden size)
+    # # TODO do this loop on the fly instead of waiting for the whole generation
+    # se_highlighted_text = ""
+    # acc_highlighted_text = ""
+    # for i in range(1, len(hidden)):
+    #     # Semantic Uncertainty Probe
+    #     token_embeddings = torch.stack([generated_token[0, 0, :].cpu() for generated_token in hidden[i]]).numpy()   # (num_layers, hidden_size)
+    #     se_concat_layers = token_embeddings[se_layer_range[0]:se_layer_range[1]].reshape(-1)
+    #     se_probe_pred = se_probe.predict_proba(se_concat_layers.reshape(1, -1))[0][1] * 2 - 1
+    #     # Accuracy Probe
+    #     # acc_token_embeddings = torch.stack([layer[0, -1, :].cpu() for layer in hidden_states])
+    #     acc_concat_layers = token_embeddings[acc_layer_range[0]:acc_layer_range[1]].reshape(-1)
+    #     acc_probe_pred = (1 - acc_probe.predict_proba(acc_concat_layers.reshape(1, -1))[0][1]) * 2 - 1
+    #     output_id = outputs.sequences[0, input_ids.shape[1]+i]
+    #     output_word = tokenizer.decode(output_id)
+    #     print(output_id, output_word, se_probe_pred, acc_probe_pred)
+    #     se_new_highlighted_text = highlight_text(output_word, se_probe_pred)
+    #     acc_new_highlighted_text = highlight_text(output_word, acc_probe_pred)
+    #     se_highlighted_text += f" {se_new_highlighted_text}"
+    #     acc_highlighted_text += f" {acc_new_highlighted_text}"
+    #     # yield se_highlighted_text, acc_highlighted_text
+    # return se_highlighted_text, acc_highlighted_text
         with gr.Column():
             max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+            temperature = gr.Slider(label="Temperature", minimum=0.01, maximum=2.0, step=0.1, value=0.01)
             top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
             top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
             repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         inputs=[message, system_prompt],
         outputs=[se_output, acc_output],
         fn=generate,
     )
     generate_btn.click(
         outputs=[se_output, acc_output]
     )
 if __name__ == "__main__":
     demo.launch()