tosin2013 committed · verified
Commit dd4d93f · 1 Parent(s): edcf891

Update app.py

Files changed (1)
  1. app.py +45 -67
app.py CHANGED
@@ -12,6 +12,7 @@ from langchain_core.prompts import ChatPromptTemplate
 import gradio as gr
 import spaces
 from huggingface_hub import InferenceClient
+import time  # Added for timing logs
 
 # Configuration
 
@@ -44,24 +45,29 @@ else:
     hf_client = InferenceClient(
         model=MODEL_NAME,
         api_key=os.environ.get("HF_TOKEN"),
-        timeout=120  # Reduced timeout for faster response
+        timeout=60  # Reduced timeout for faster response
     )
 
 # Load the Hugging Face dataset
 try:
+    start = time.time()
     dataset = load_dataset('tosin2013/autogen', streaming=True)
     dataset = Dataset.from_list(list(dataset['train']))
+    end = time.time()
+    print(f"[TIMING] Dataset loading took {end - start:.2f} seconds")
 except Exception as e:
     print(f"[ERROR] Failed to load dataset: {e}")
     exit(1)
 
 # Initialize embeddings
 print("[EMBEDDINGS] Loading sentence-transformers model...")
+start = time.time()
 embeddings = HuggingFaceEmbeddings(
     model_name="sentence-transformers/all-MiniLM-L6-v2",
     model_kwargs={"device": "cpu"}
 )
-print("[EMBEDDINGS] Sentence-transformers model loaded successfully")
+end = time.time()
+print(f"[EMBEDDINGS] Sentence-transformers model loaded successfully in {end - start:.2f} seconds")
 
 # Extract texts from the dataset
 texts = dataset['input']
@@ -69,134 +75,106 @@ texts = dataset['input']
 # Create and cache embeddings for the texts
 if not os.path.exists('embeddings.npy'):
     print("[LOG] Generating embeddings...")
+    start = time.time()
     text_embeddings = embeddings.embed_documents(texts)
-    print(f"[EMBEDDINGS] Generated embeddings for {len(texts)} documents")
     np.save('embeddings.npy', text_embeddings)
+    end = time.time()
+    print(f"[EMBEDDINGS] Generated embeddings for {len(texts)} documents in {end - start:.2f} seconds")
 else:
     print("[LOG] Loading cached embeddings...")
+    start = time.time()
     text_embeddings = np.load('embeddings.npy')
+    end = time.time()
+    print(f"[TIMING] Loaded cached embeddings in {end - start:.2f} seconds")
 
 # Fit and cache nearest neighbor model
 if not os.path.exists('nn_model.pkl'):
     print("[LOG] Fitting nearest neighbors model...")
+    start = time.time()
     nn = NearestNeighbors(n_neighbors=5, metric='cosine')
     nn.fit(np.array(text_embeddings))
-    import pickle
     with open('nn_model.pkl', 'wb') as f:
         pickle.dump(nn, f)
+    end = time.time()
+    print(f"[TIMING] Fitted nearest neighbors model in {end - start:.2f} seconds")
 else:
     print("[LOG] Loading cached nearest neighbors model...")
-    import pickle
+    start = time.time()
    with open('nn_model.pkl', 'rb') as f:
        nn = pickle.load(f)
+    end = time.time()
+    print(f"[TIMING] Loaded nearest neighbors model in {end - start:.2f} seconds")
 
 @spaces.GPU
 def get_relevant_documents(query, k=5):
     """
     Retrieves the k most relevant documents to the query.
     """
-    import time
     start_time = time.time()
-
     print("[EMBEDDINGS] Generating embedding for query...")
     query_embedding = embeddings.embed_query(query)
     print("[EMBEDDINGS] Query embedding generated successfully")
     distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
     relevant_docs = [texts[i] for i in indices[0]]
-
     elapsed_time = time.time() - start_time
-    print(f"[PERF] get_relevant_documents took {elapsed_time:.2f} seconds")
+    print(f"[TIMING] get_relevant_documents took {elapsed_time:.2f} seconds")
     return relevant_docs
 
 @spaces.GPU
 def generate_response(question, history):
-    import time
     start_time = time.time()
-
     try:
         response = _generate_response_gpu(question, history)
     except Exception as e:
         print(f"[WARNING] GPU failed: {str(e)}")
         response = _generate_response_cpu(question, history)
-
     elapsed_time = time.time() - start_time
-    print(f"[PERF] generate_response took {elapsed_time:.2f} seconds")
+    print(f"[TIMING] generate_response took {elapsed_time:.2f} seconds")
     return response
 
 @spaces.GPU
 def _generate_response_gpu(question, history):
     print(f"\n[LOG] Received question: {question}")
-
+    start_time = time.time()
     # Get relevant documents based on the query
     relevant_docs = get_relevant_documents(question, k=3)
     print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
-
-    # Create the prompt for the LLM
     context = "\n".join(relevant_docs)
     prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
     print(f"[LOG] Generated prompt: {prompt[:200]}...") # Log first 200 chars of prompt
-
     if model_provider.lower() == "huggingface":
         messages = [
             {
                 "role": "system",
-                "content": '''### MEMORY ###
-Recall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
-### VISIONARY GUIDANCE ###
-This prompt is designed to empower users to seamlessly convert their requests into AutoGen v0.4 agent code. By harnessing the advanced features of AutoGen v0.4, we aim to provide a scalable and flexible solution that is both user-friendly and technically robust. The collaborative effort of the personas ensures a comprehensive, innovative, and user-centric approach to meet the user's objectives.
-### CONTEXT ###
-AutoGen v0.4 is a comprehensive rewrite aimed at building robust, scalable, and cross-language AI agents. Key features include asynchronous messaging, scalable distributed agents support, modular extensibility, cross-language capabilities, improved observability, and full typing integration.
-### OBJECTIVE ###
-Translate user requests into AutoGen v0.4 agent code that leverages the framework's new features. Ensure the code is syntactically correct, scalable, and aligns with best practices.
-### STYLE ###
-Professional, clear, and focused on code quality.
-### TONE ###
-Informative, helpful, and user-centric.
-### AUDIENCE ###
-Users seeking to implement their requests using AutoGen v0.4 agents.
-### RESPONSE FORMAT ###
-Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize features like asynchronous messaging and modular design where appropriate. Include comments to explain key components and enhance understandability.
-### TEAM PERSONAS’ CONTRIBUTIONS ###
-- **Analyst:** Ensured the prompt provides clear, structured instructions to accurately convert user requests into code, emphasizing full typing integration for precision.
-- **Creative:** Suggested incorporating comments and explanations within the code to foster innovative usage and enhance user engagement with AutoGen v0.4 features.
-- **Strategist:** Focused on aligning the prompt with long-term scalability by encouraging the use of modular and extensible design principles inherent in AutoGen v0.4.
-- **Empathizer:** Enhanced the prompt to be user-centric, ensuring it addresses user needs effectively and makes the code accessible and easy to understand.
-- **Researcher:** Integrated the latest information about AutoGen v0.4, ensuring the prompt and generated code reflect current capabilities and best practices.
-### SYSTEM GUARDRAILS ###
-- If unsure about the user's request, ask clarifying questions rather than making assumptions.
-- Do not fabricate data or features not supported by AutoGen v0.4.
-- Ensure the code is scalable, modular, and adheres to best practices.
-### START ###
-'''
+                "content": "### MEMORY ###\nRecall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence."
             },
             {
                 "role": "user",
                 "content": prompt
             }
         ]
-
+        start_api = time.time()
         completion = hf_client.chat.completions.create(
             model=MODEL_NAME,
             messages=messages,
             max_tokens=500
         )
+        end_api = time.time()
+        print(f"[TIMING] Hugging Face API call took {end_api - start_api:.2f} seconds")
         response = completion.choices[0].message.content
-        print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
-        print(f"[LOG] Hugging Face response: {response[:200]}...")
-
     elif model_provider.lower() == "openai":
+        start_api = time.time()
         response = client.chat.completions.create(
             model=os.environ.get("OPENAI_MODEL"),
             messages=[
                 {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
                 {"role": "user", "content": prompt},
             ]
-        )
-        response = response.choices[0].message.content
-        print(f"[LOG] Using OpenAI model: {os.environ.get('OPENAI_MODEL')}")
-        print(f"[LOG] OpenAI response: {response[:200]}...") # Log first 200 chars of response
-
-    # Update chat history with new message pair
+        ).choices[0].message.content
+        end_api = time.time()
+        print(f"[TIMING] OpenAI API call took {end_api - start_api:.2f} seconds")
+    elapsed_time = time.time() - start_time
+    print(f"[TIMING] _generate_response_gpu took {elapsed_time:.2f} seconds")
     history.append((question, response))
     return history
 
@@ -205,38 +183,38 @@ Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize fe
 def _generate_response_cpu(question, history):
     print(f"[LOG] Running on CPU")
     try:
+        start_time = time.time()
         relevant_docs = get_relevant_documents(question, k=3)
         context = "\n".join(relevant_docs)
         prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
         print(f"[LOG] Generated prompt: {prompt[:200]}...")
-
         if model_provider.lower() == "huggingface":
-            # Use CPU version of the model
             messages = [
-                {
-                    "role": "system",
-                    "content": '''### MEMORY ###\nRecall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
-### SYSTEM GUARDRAILS ###'''
-                },
+                {"role": "system", "content": "### MEMORY ###\nRecall all previously provided instructions, context, and data."},
                 {"role": "user", "content": prompt}
             ]
-
+            start_api = time.time()
            completion = hf_client.chat.completions.create(
                 model=MODEL_NAME,
                 messages=messages,
                 max_tokens=500
             )
+            end_api = time.time()
+            print(f"[TIMING] Hugging Face API call took {end_api - start_api:.2f} seconds")
             response = completion.choices[0].message.content
         elif model_provider.lower() == "openai":
+            start_api = time.time()
             response = client.chat.completions.create(
                 model=os.environ.get("OPENAI_MODEL"),
                 messages=[
-                    {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
+                    {"role": "system", "content": "You are a helpful assistant."},
                     {"role": "user", "content": prompt},
                 ]
-            )
-            response = response.choices[0].message.content
-
+            ).choices[0].message.content
+            end_api = time.time()
+            print(f"[TIMING] OpenAI API call took {end_api - start_api:.2f} seconds")
+        elapsed_time = time.time() - start_time
+        print(f"[TIMING] _generate_response_cpu took {elapsed_time:.2f} seconds")
         history.append((question, response))
         return history
     except Exception as e:
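
The commit instruments each stage of app.py with paired time.time() calls and "[TIMING]" prints. As a minimal sketch (not part of this commit), the same pattern could be factored into a single reusable helper; the timed context manager below is hypothetical:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Measure wall-clock time for the wrapped block and log it in the
    # same "[TIMING] <label> took X.XX seconds" format used in app.py.
    start = time.time()
    try:
        yield
    finally:
        print(f"[TIMING] {label} took {time.time() - start:.2f} seconds")

# Hypothetical usage (the commit inlines these calls instead):
# with timed("Dataset loading"):
#     dataset = load_dataset('tosin2013/autogen', streaming=True)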