tosin2013 committed
Commit d4ba41f · 1 Parent(s): c6155ce
Files changed (2):
  1. app.py +79 -33
  2. embeddings.npy +3 -0
app.py CHANGED
@@ -1,6 +1,6 @@
 import os
 from openai import OpenAI
-from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings
 from datasets import load_dataset, Dataset
 from sklearn.neighbors import NearestNeighbors
 import numpy as np
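The only change in this hunk is the embeddings import: the langchain_community.embeddings path is deprecated upstream in favor of the langchain-huggingface partner package. A minimal sketch of the updated usage (the pip package name and the 384-dimension note are assumptions from the upstream docs, not part of this commit):

# pip install langchain-huggingface sentence-transformers  (assumed install step)
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},  # CPU suffices for this small model
)
vec = embeddings.embed_query("hello world")
print(len(vec))  # 384 dimensions for all-MiniLM-L6-v2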
@@ -34,23 +34,22 @@ if model_provider.lower() == "openai":
     MODEL_NAME = os.environ['OPENAI_MODEL']
     client = OpenAI(
         base_url=os.environ.get("OPENAI_BASE"),
-        api_key=api_key
+        api_key=os.environ.get("OPENAI_API_KEY")
     )
 else:
-    MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"
+    MODEL_NAME = "deepseek-ai/deepseek-coder-33b-instruct"
     # Initialize Hugging Face InferenceClient with GPU support
     hf_client = InferenceClient(
         model=MODEL_NAME,
         api_key=os.environ.get("HF_TOKEN"),
-        timeout=120  # Increased timeout for GPU inference
+        timeout=30  # Reduced timeout for faster response
     )
 
 # Load the Hugging Face dataset
 dataset = load_dataset('tosin2013/autogen', streaming=True)
 dataset = Dataset.from_list(list(dataset['train']))
 
-# Check GPU availability and initialize embeddings
-# Use CPU for embeddings since GPU is handled by spaces.GPU decorator
+# Initialize embeddings
 embeddings = HuggingFaceEmbeddings(
     model_name="sentence-transformers/all-MiniLM-L6-v2",
     model_kwargs={"device": "cpu"}
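For context, a minimal sketch of calling the new default model through huggingface_hub's InferenceClient; the chat_completion call and prompt are illustrative assumptions, not code from this commit:

import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    model="deepseek-ai/deepseek-coder-33b-instruct",  # new default from this commit
    api_key=os.environ.get("HF_TOKEN"),
    timeout=30,  # matches the reduced timeout above
)
out = client.chat_completion(
    messages=[{"role": "user", "content": "Write a minimal AutoGen agent."}],
    max_tokens=256,
)
print(out.choices[0].message.content)

Note the trade-off in the timeout change: 30 s surfaces failures faster, but can abort long generations that the previous 120 s budget tolerated.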
@@ -59,41 +58,57 @@ embeddings = HuggingFaceEmbeddings(
 # Extract texts from the dataset
 texts = dataset['input']
 
-# Create embeddings for the texts
-text_embeddings = embeddings.embed_documents(texts)
-
-# Fit a nearest neighbor model
-nn = NearestNeighbors(n_neighbors=5, metric='cosine')
-nn.fit(np.array(text_embeddings))
+# Create and cache embeddings for the texts
+if not os.path.exists('embeddings.npy'):
+    print("[LOG] Generating embeddings...")
+    text_embeddings = embeddings.embed_documents(texts)
+    np.save('embeddings.npy', text_embeddings)
+else:
+    print("[LOG] Loading cached embeddings...")
+    text_embeddings = np.load('embeddings.npy')
+
+# Fit and cache nearest neighbor model
+if not os.path.exists('nn_model.pkl'):
+    print("[LOG] Fitting nearest neighbors model...")
+    nn = NearestNeighbors(n_neighbors=5, metric='cosine')
+    nn.fit(np.array(text_embeddings))
+    import pickle
+    with open('nn_model.pkl', 'wb') as f:
+        pickle.dump(nn, f)
+else:
+    print("[LOG] Loading cached nearest neighbors model...")
+    import pickle
+    with open('nn_model.pkl', 'rb') as f:
+        nn = pickle.load(f)
 
 def get_relevant_documents(query, k=5):
     """
     Retrieves the k most relevant documents to the query.
     """
-    try:
-        # Try GPU first
-        with spaces.GPU(duration=15):
-            query_embedding = embeddings.embed_query(query)
-            distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
-            relevant_docs = [texts[i] for i in indices[0]]
-            return relevant_docs
-    except Exception as e:
-        print(f"[WARNING] GPU failed, falling back to CPU: {str(e)}")
-        # Fallback to CPU
-        embeddings.model_kwargs["device"] = "cpu"
-        query_embedding = embeddings.embed_query(query)
-        distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
-        relevant_docs = [texts[i] for i in indices[0]]
-        return relevant_docs
+    import time
+    start_time = time.time()
+
+    query_embedding = embeddings.embed_query(query)
+    distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
+    relevant_docs = [texts[i] for i in indices[0]]
+
+    elapsed_time = time.time() - start_time
+    print(f"[PERF] get_relevant_documents took {elapsed_time:.2f} seconds")
+    return relevant_docs
 
 def generate_response(question, history):
+    import time
+    start_time = time.time()
+
     try:
-        # Try GPU first with reduced duration
-        with spaces.GPU(duration=60):
-            return _generate_response_gpu(question, history)
+        response = _generate_response_gpu(question, history)
     except Exception as e:
-        print(f"[WARNING] GPU failed, falling back to CPU: {str(e)}")
-        return _generate_response_cpu(question, history)
+        print(f"[WARNING] GPU failed: {str(e)}")
+        response = _generate_response_cpu(question, history)
+
+    elapsed_time = time.time() - start_time
+    print(f"[PERF] generate_response took {elapsed_time:.2f} seconds")
+    return response
 
 def _generate_response_gpu(question, history):
     print(f"\n[LOG] Received question: {question}")
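The retrieval setup now caches both artifacts so restarts skip re-embedding the dataset. A self-contained sketch of the same cache-on-first-run pattern with toy vectors (file names match the new code; the toy data is an assumption):

import os
import pickle

import numpy as np
from sklearn.neighbors import NearestNeighbors

def build_or_load_index(vectors, emb_path='embeddings.npy', nn_path='nn_model.pkl'):
    # Reuse cached artifacts when both exist; otherwise build and persist them.
    if os.path.exists(emb_path) and os.path.exists(nn_path):
        emb = np.load(emb_path)
        with open(nn_path, 'rb') as f:
            nn = pickle.load(f)
    else:
        emb = np.asarray(vectors, dtype=np.float32)
        nn = NearestNeighbors(n_neighbors=2, metric='cosine').fit(emb)
        np.save(emb_path, emb)
        with open(nn_path, 'wb') as f:
            pickle.dump(nn, f)
    return emb, nn

emb, nn = build_or_load_index([[1.0, 0.0], [0.0, 1.0], [0.7, 0.7]])
distances, indices = nn.kneighbors([[0.9, 0.1]], n_neighbors=2)
print(indices[0])  # indices of the two nearest vectors

One caveat: the cache is keyed only on file existence, so if the dataset or embedding model changes, the stale embeddings.npy / nn_model.pkl files must be deleted by hand.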
@@ -273,6 +288,7 @@ Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize fe
         history.append((question, error_msg))
         return history
 
+
 # Create Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown(f"""
@@ -318,5 +334,35 @@ with gr.Blocks() as demo:
         outputs=[chatbot, question]
     )
 
+import socket
+
+def find_available_port(start_port=7860, end_port=7900):
+    for port in range(start_port, end_port + 1):
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            try:
+                s.bind(('', port))
+                return port
+            except OSError:
+                continue
+    raise OSError(f"No available ports between {start_port} and {end_port}")
+
 if __name__ == "__main__":
-    demo.launch()
+    try:
+        port = find_available_port()
+        print(f"[LOG] Launching application on port {port}")
+        demo.launch(
+            server_port=port,
+            share=True,
+            server_name="0.0.0.0",
+            prevent_thread_lock=True
+        )
+        # Verify server is actually running
+        import time
+        time.sleep(2)  # Give server time to start
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            if s.connect_ex(('localhost', port)) == 0:
+                print(f"[SUCCESS] Server is running on port {port}")
+            else:
+                print(f"[ERROR] Failed to bind to port {port}")
+    except Exception as e:
+        print(f"[ERROR] Failed to start application: {str(e)}")
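Design note: prevent_thread_lock=True makes demo.launch() return immediately instead of blocking, which is what lets the connect_ex check run afterwards. There is also an inherent race between find_available_port releasing its socket and Gradio re-binding it, so the post-launch check is a sensible guard. The fixed time.sleep(2), though, is itself race-prone; a polling variant like the sketch below would be more robust (function name and timeout are assumptions, not part of this commit):

import socket
import time

def wait_until_listening(host='localhost', port=7860, timeout_s=10):
    # Poll until a TCP connect succeeds or the deadline passes.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            if s.connect_ex((host, port)) == 0:
                return True
        time.sleep(0.5)
    return False

This would be called after demo.launch(..., prevent_thread_lock=True) in place of the fixed sleep plus single check.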
embeddings.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9538fa345c8d0006fb2cb25372e1cbcd7d761ea7c02307196878823c3d09942b
+size 1483904
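The committed embeddings.npy is a Git LFS pointer file, not the raw array: the three lines record the pointer spec version, the sha256 of the actual blob, and its size (about 1.4 MB). A small sketch that parses such a pointer (illustrative, not from the commit):

def read_lfs_pointer(path='embeddings.npy'):
    # Parse the 'key value' lines of a Git LFS pointer file.
    fields = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            key, _, value = line.strip().partition(' ')
            fields[key] = value
    return fields

ptr = read_lfs_pointer()
print(ptr['oid'], ptr['size'])  # sha256:9538fa... 1483904

One subtlety worth noting: if the repo is cloned without git-lfs, the pointer text itself sits at embeddings.npy, so os.path.exists('embeddings.npy') is still true and np.load will raise an error rather than trigger the regeneration branch in app.py.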