Update app.py
app.py (CHANGED)
@@ -12,6 +12,7 @@ from langchain_core.prompts import ChatPromptTemplate
 import gradio as gr
 import spaces
 from huggingface_hub import InferenceClient
+import time  # Added for timing logs

 # Configuration

@@ -44,24 +45,29 @@ else:
 hf_client = InferenceClient(
     model=MODEL_NAME,
     api_key=os.environ.get("HF_TOKEN"),
-    timeout=
+    timeout=60  # Reduced timeout for faster response
 )

 # Load the Hugging Face dataset
 try:
+    start = time.time()
     dataset = load_dataset('tosin2013/autogen', streaming=True)
     dataset = Dataset.from_list(list(dataset['train']))
+    end = time.time()
+    print(f"[TIMING] Dataset loading took {end - start:.2f} seconds")
 except Exception as e:
     print(f"[ERROR] Failed to load dataset: {e}")
     exit(1)

 # Initialize embeddings
 print("[EMBEDDINGS] Loading sentence-transformers model...")
+start = time.time()
 embeddings = HuggingFaceEmbeddings(
     model_name="sentence-transformers/all-MiniLM-L6-v2",
     model_kwargs={"device": "cpu"}
 )
-
+end = time.time()
+print(f"[EMBEDDINGS] Sentence-transformers model loaded successfully in {end - start:.2f} seconds")

 # Extract texts from the dataset
 texts = dataset['input']
@@ -69,134 +75,106 @@ texts = dataset['input']
 # Create and cache embeddings for the texts
 if not os.path.exists('embeddings.npy'):
     print("[LOG] Generating embeddings...")
+    start = time.time()
     text_embeddings = embeddings.embed_documents(texts)
-    print(f"[EMBEDDINGS] Generated embeddings for {len(texts)} documents")
     np.save('embeddings.npy', text_embeddings)
+    end = time.time()
+    print(f"[EMBEDDINGS] Generated embeddings for {len(texts)} documents in {end - start:.2f} seconds")
 else:
     print("[LOG] Loading cached embeddings...")
+    start = time.time()
     text_embeddings = np.load('embeddings.npy')
+    end = time.time()
+    print(f"[TIMING] Loaded cached embeddings in {end - start:.2f} seconds")

 # Fit and cache nearest neighbor model
 if not os.path.exists('nn_model.pkl'):
     print("[LOG] Fitting nearest neighbors model...")
+    start = time.time()
     nn = NearestNeighbors(n_neighbors=5, metric='cosine')
     nn.fit(np.array(text_embeddings))
-    import pickle
     with open('nn_model.pkl', 'wb') as f:
         pickle.dump(nn, f)
+    end = time.time()
+    print(f"[TIMING] Fitted nearest neighbors model in {end - start:.2f} seconds")
 else:
     print("[LOG] Loading cached nearest neighbors model...")
-
+    start = time.time()
     with open('nn_model.pkl', 'rb') as f:
         nn = pickle.load(f)
+    end = time.time()
+    print(f"[TIMING] Loaded nearest neighbors model in {end - start:.2f} seconds")

 @spaces.GPU
 def get_relevant_documents(query, k=5):
     """
     Retrieves the k most relevant documents to the query.
     """
-    import time
     start_time = time.time()
-
     print("[EMBEDDINGS] Generating embedding for query...")
     query_embedding = embeddings.embed_query(query)
     print("[EMBEDDINGS] Query embedding generated successfully")
     distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
     relevant_docs = [texts[i] for i in indices[0]]
-
     elapsed_time = time.time() - start_time
-    print(f"[
+    print(f"[TIMING] get_relevant_documents took {elapsed_time:.2f} seconds")
     return relevant_docs

 @spaces.GPU
 def generate_response(question, history):
-    import time
     start_time = time.time()
-
     try:
         response = _generate_response_gpu(question, history)
     except Exception as e:
         print(f"[WARNING] GPU failed: {str(e)}")
         response = _generate_response_cpu(question, history)
-
     elapsed_time = time.time() - start_time
-    print(f"[
+    print(f"[TIMING] generate_response took {elapsed_time:.2f} seconds")
     return response

 @spaces.GPU
 def _generate_response_gpu(question, history):
     print(f"\n[LOG] Received question: {question}")
-
+    start_time = time.time()
     # Get relevant documents based on the query
     relevant_docs = get_relevant_documents(question, k=3)
     print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
-
-    # Create the prompt for the LLM
     context = "\n".join(relevant_docs)
     prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
     print(f"[LOG] Generated prompt: {prompt[:200]}...")  # Log first 200 chars of prompt
-
     if model_provider.lower() == "huggingface":
         messages = [
             {
                 "role": "system",
-                "content": '''### MEMORY ###
-Recall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
-### VISIONARY GUIDANCE ###
-This prompt is designed to empower users to seamlessly convert their requests into AutoGen v0.4 agent code. By harnessing the advanced features of AutoGen v0.4, we aim to provide a scalable and flexible solution that is both user-friendly and technically robust. The collaborative effort of the personas ensures a comprehensive, innovative, and user-centric approach to meet the user's objectives.
-### CONTEXT ###
-AutoGen v0.4 is a comprehensive rewrite aimed at building robust, scalable, and cross-language AI agents. Key features include asynchronous messaging, scalable distributed agents support, modular extensibility, cross-language capabilities, improved observability, and full typing integration.
-### OBJECTIVE ###
-Translate user requests into AutoGen v0.4 agent code that leverages the framework's new features. Ensure the code is syntactically correct, scalable, and aligns with best practices.
-### STYLE ###
-Professional, clear, and focused on code quality.
-### TONE ###
-Informative, helpful, and user-centric.
-### AUDIENCE ###
-Users seeking to implement their requests using AutoGen v0.4 agents.
-### RESPONSE FORMAT ###
-Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize features like asynchronous messaging and modular design where appropriate. Include comments to explain key components and enhance understandability.
-### TEAM PERSONAS’ CONTRIBUTIONS ###
-- **Analyst:** Ensured the prompt provides clear, structured instructions to accurately convert user requests into code, emphasizing full typing integration for precision.
-- **Creative:** Suggested incorporating comments and explanations within the code to foster innovative usage and enhance user engagement with AutoGen v0.4 features.
-- **Strategist:** Focused on aligning the prompt with long-term scalability by encouraging the use of modular and extensible design principles inherent in AutoGen v0.4.
-- **Empathizer:** Enhanced the prompt to be user-centric, ensuring it addresses user needs effectively and makes the code accessible and easy to understand.
-- **Researcher:** Integrated the latest information about AutoGen v0.4, ensuring the prompt and generated code reflect current capabilities and best practices.
-### SYSTEM GUARDRAILS ###
-- If unsure about the user's request, ask clarifying questions rather than making assumptions.
-- Do not fabricate data or features not supported by AutoGen v0.4.
-- Ensure the code is scalable, modular, and adheres to best practices.
-### START ###
-'''
+                "content": "### MEMORY ###\nRecall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence."
             },
             {
                 "role": "user",
                 "content": prompt
             }
         ]
-
+        start_api = time.time()
         completion = hf_client.chat.completions.create(
             model=MODEL_NAME,
             messages=messages,
             max_tokens=500
         )
+        end_api = time.time()
+        print(f"[TIMING] Hugging Face API call took {end_api - start_api:.2f} seconds")
         response = completion.choices[0].message.content
-        print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
-        print(f"[LOG] Hugging Face response: {response[:200]}...")
-
     elif model_provider.lower() == "openai":
+        start_api = time.time()
         response = client.chat.completions.create(
             model=os.environ.get("OPENAI_MODEL"),
             messages=[
                 {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
                 {"role": "user", "content": prompt},
             ]
-        )
-
-        print(f"[
-
-
-    # Update chat history with new message pair
+        ).choices[0].message.content
+        end_api = time.time()
+        print(f"[TIMING] OpenAI API call took {end_api - start_api:.2f} seconds")
+    elapsed_time = time.time() - start_time
+    print(f"[TIMING] _generate_response_gpu took {elapsed_time:.2f} seconds")
     history.append((question, response))
     return history

@@ -205,38 +183,38 @@ Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize fe
 def _generate_response_cpu(question, history):
     print(f"[LOG] Running on CPU")
     try:
+        start_time = time.time()
         relevant_docs = get_relevant_documents(question, k=3)
         context = "\n".join(relevant_docs)
         prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
         print(f"[LOG] Generated prompt: {prompt[:200]}...")
-
         if model_provider.lower() == "huggingface":
-            # Use CPU version of the model
             messages = [
-                {
-                    "role": "system",
-                    "content": '''### MEMORY ###\nRecall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
-### SYSTEM GUARDRAILS ###'''
-                },
+                {"role": "system", "content": "### MEMORY ###\nRecall all previously provided instructions, context, and data."},
                 {"role": "user", "content": prompt}
             ]
-
+            start_api = time.time()
            completion = hf_client.chat.completions.create(
                 model=MODEL_NAME,
                 messages=messages,
                 max_tokens=500
             )
+            end_api = time.time()
+            print(f"[TIMING] Hugging Face API call took {end_api - start_api:.2f} seconds")
             response = completion.choices[0].message.content
         elif model_provider.lower() == "openai":
+            start_api = time.time()
             response = client.chat.completions.create(
                 model=os.environ.get("OPENAI_MODEL"),
                 messages=[
-                    {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
+                    {"role": "system", "content": "You are a helpful assistant."},
                     {"role": "user", "content": prompt},
                 ]
-            )
-
-
+            ).choices[0].message.content
+            end_api = time.time()
+            print(f"[TIMING] OpenAI API call took {end_api - start_api:.2f} seconds")
+        elapsed_time = time.time() - start_time
+        print(f"[TIMING] _generate_response_cpu took {elapsed_time:.2f} seconds")
         history.append((question, response))
         return history
     except Exception as e:
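The commit applies the same start/end/print timing pattern around many blocks. Below is a minimal sketch, not part of the commit, of how that repeated pattern could be collected into a single helper; the name `timed` is hypothetical, and only the [TIMING] log format is taken from the diff above.

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Print one [TIMING] line for the wrapped block, in the same format as the logs added above.
    start = time.time()
    try:
        yield
    finally:
        print(f"[TIMING] {label} took {time.time() - start:.2f} seconds")

# Hypothetical usage with the dataset-loading step from app.py:
# with timed("Dataset loading"):
#     dataset = load_dataset('tosin2013/autogen', streaming=True)
#     dataset = Dataset.from_list(list(dataset['train']))

Keeping the label and the timing in one place avoids the start/end pairs drifting apart as more blocks are instrumented.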