locked the thread before generation on llama
app.py CHANGED
@@ -3,8 +3,7 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 from huggingface_hub import login
 import os
-
-from openai import OpenAI
+import threading
 import spaces
 # import multiprocessing as mp
 import sys
@@ -111,13 +110,19 @@ def llama_generation(input_text: str,
     if temperature == 0:
         generate_kwargs["do_sample"] = False
 
+    # Use a lock object to synchronize access to the llama_model
+    lock = threading.Lock()
+
     # # Place the generation in a thread so we can access it.
     # # place the function as target and place the kwargs next as the kwargs
-
-
-
-
-
+    def generation_llama():
+        with lock:
+            # Generate response using Llama3
+            response = llama_model.generate(**generate_kwargs)
+        return response
+
+    # start the thread and wait for it to finish
+    thread = threading.Thread(target=generation_llama)
     thread.start()
     thread.join()
     return streamer
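A note on the locking pattern in this diff: `lock` is created inside `llama_generation`, so each request builds its own private lock, and two overlapping requests can still call `llama_model.generate` at the same time. The sketch below is an illustration rather than the Space's actual code: it hoists the lock to module scope so it is shared across calls. The checkpoint name (`meta-llama/Meta-Llama-3-8B-Instruct`), prompt handling, and generation settings are assumptions made to keep the example self-contained and runnable.

```python
import threading

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Assumed checkpoint; the Space's actual model may differ.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
llama_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
llama_model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# One lock shared by every request. A lock created inside the request
# handler would be private to that call and would serialize nothing.
llama_lock = threading.Lock()


def llama_generation(input_text: str, temperature: float = 0.7):
    input_ids = llama_tokenizer(input_text, return_tensors="pt").input_ids
    streamer = TextIteratorStreamer(
        llama_tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=256,
        temperature=temperature,
    )
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    def generation_llama():
        # Only one thread at a time may drive the model.
        with llama_lock:
            llama_model.generate(**generate_kwargs)

    thread = threading.Thread(target=generation_llama)
    thread.start()
    # Joining here means the caller only sees text after generation has
    # finished; dropping the join() lets the streamer yield tokens live.
    thread.join()
    return streamer
```

With the lock at module scope, overlapping Gradio requests queue up on `llama_lock` instead of racing each other inside `generate`, which appears to be what the commit message means by "locked the thread before generation".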