Update app.py
app.py (CHANGED)
@@ -12,6 +12,7 @@ from langchain_core.prompts import ChatPromptTemplate
 import gradio as gr
 import spaces
 from huggingface_hub import InferenceClient
+import time  # Added for timing logs

 # Configuration

@@ -44,24 +45,29 @@ else:
 hf_client = InferenceClient(
     model=MODEL_NAME,
     api_key=os.environ.get("HF_TOKEN"),
-    timeout=
+    timeout=60  # Reduced timeout for faster response
 )

 # Load the Hugging Face dataset
 try:
+    start = time.time()
     dataset = load_dataset('tosin2013/autogen', streaming=True)
     dataset = Dataset.from_list(list(dataset['train']))
+    end = time.time()
+    print(f"[TIMING] Dataset loading took {end - start:.2f} seconds")
 except Exception as e:
     print(f"[ERROR] Failed to load dataset: {e}")
     exit(1)

 # Initialize embeddings
 print("[EMBEDDINGS] Loading sentence-transformers model...")
+start = time.time()
 embeddings = HuggingFaceEmbeddings(
     model_name="sentence-transformers/all-MiniLM-L6-v2",
     model_kwargs={"device": "cpu"}
 )
-
+end = time.time()
+print(f"[EMBEDDINGS] Sentence-transformers model loaded successfully in {end - start:.2f} seconds")

 # Extract texts from the dataset
 texts = dataset['input']
@@ -69,134 +75,106 @@ texts = dataset['input']
 # Create and cache embeddings for the texts
 if not os.path.exists('embeddings.npy'):
     print("[LOG] Generating embeddings...")
+    start = time.time()
     text_embeddings = embeddings.embed_documents(texts)
-    print(f"[EMBEDDINGS] Generated embeddings for {len(texts)} documents")
     np.save('embeddings.npy', text_embeddings)
+    end = time.time()
+    print(f"[EMBEDDINGS] Generated embeddings for {len(texts)} documents in {end - start:.2f} seconds")
 else:
     print("[LOG] Loading cached embeddings...")
+    start = time.time()
     text_embeddings = np.load('embeddings.npy')
+    end = time.time()
+    print(f"[TIMING] Loaded cached embeddings in {end - start:.2f} seconds")

 # Fit and cache nearest neighbor model
 if not os.path.exists('nn_model.pkl'):
     print("[LOG] Fitting nearest neighbors model...")
+    start = time.time()
     nn = NearestNeighbors(n_neighbors=5, metric='cosine')
     nn.fit(np.array(text_embeddings))
-    import pickle
     with open('nn_model.pkl', 'wb') as f:
         pickle.dump(nn, f)
+    end = time.time()
+    print(f"[TIMING] Fitted nearest neighbors model in {end - start:.2f} seconds")
 else:
     print("[LOG] Loading cached nearest neighbors model...")
-
+    start = time.time()
     with open('nn_model.pkl', 'rb') as f:
         nn = pickle.load(f)
+    end = time.time()
+    print(f"[TIMING] Loaded nearest neighbors model in {end - start:.2f} seconds")

 @spaces.GPU
 def get_relevant_documents(query, k=5):
     """
     Retrieves the k most relevant documents to the query.
     """
-    import time
     start_time = time.time()
-
     print("[EMBEDDINGS] Generating embedding for query...")
     query_embedding = embeddings.embed_query(query)
     print("[EMBEDDINGS] Query embedding generated successfully")
     distances, indices = nn.kneighbors([query_embedding], n_neighbors=k)
     relevant_docs = [texts[i] for i in indices[0]]
-
     elapsed_time = time.time() - start_time
-    print(f"[
+    print(f"[TIMING] get_relevant_documents took {elapsed_time:.2f} seconds")
     return relevant_docs

 @spaces.GPU
 def generate_response(question, history):
-    import time
     start_time = time.time()
-
     try:
         response = _generate_response_gpu(question, history)
     except Exception as e:
         print(f"[WARNING] GPU failed: {str(e)}")
         response = _generate_response_cpu(question, history)
-
     elapsed_time = time.time() - start_time
-    print(f"[
+    print(f"[TIMING] generate_response took {elapsed_time:.2f} seconds")
     return response

 @spaces.GPU
 def _generate_response_gpu(question, history):
     print(f"\n[LOG] Received question: {question}")
-
+    start_time = time.time()
     # Get relevant documents based on the query
     relevant_docs = get_relevant_documents(question, k=3)
     print(f"[LOG] Retrieved {len(relevant_docs)} relevant documents")
-
-    # Create the prompt for the LLM
     context = "\n".join(relevant_docs)
     prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
     print(f"[LOG] Generated prompt: {prompt[:200]}...")  # Log first 200 chars of prompt
-
     if model_provider.lower() == "huggingface":
         messages = [
             {
                 "role": "system",
-                "content": '''### MEMORY ###
-Recall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
-### VISIONARY GUIDANCE ###
-This prompt is designed to empower users to seamlessly convert their requests into AutoGen v0.4 agent code. By harnessing the advanced features of AutoGen v0.4, we aim to provide a scalable and flexible solution that is both user-friendly and technically robust. The collaborative effort of the personas ensures a comprehensive, innovative, and user-centric approach to meet the user's objectives.
-### CONTEXT ###
-AutoGen v0.4 is a comprehensive rewrite aimed at building robust, scalable, and cross-language AI agents. Key features include asynchronous messaging, scalable distributed agents support, modular extensibility, cross-language capabilities, improved observability, and full typing integration.
-### OBJECTIVE ###
-Translate user requests into AutoGen v0.4 agent code that leverages the framework's new features. Ensure the code is syntactically correct, scalable, and aligns with best practices.
-### STYLE ###
-Professional, clear, and focused on code quality.
-### TONE ###
-Informative, helpful, and user-centric.
-### AUDIENCE ###
-Users seeking to implement their requests using AutoGen v0.4 agents.
-### RESPONSE FORMAT ###
-Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize features like asynchronous messaging and modular design where appropriate. Include comments to explain key components and enhance understandability.
-### TEAM PERSONAS’ CONTRIBUTIONS ###
-- **Analyst:** Ensured the prompt provides clear, structured instructions to accurately convert user requests into code, emphasizing full typing integration for precision.
-- **Creative:** Suggested incorporating comments and explanations within the code to foster innovative usage and enhance user engagement with AutoGen v0.4 features.
-- **Strategist:** Focused on aligning the prompt with long-term scalability by encouraging the use of modular and extensible design principles inherent in AutoGen v0.4.
-- **Empathizer:** Enhanced the prompt to be user-centric, ensuring it addresses user needs effectively and makes the code accessible and easy to understand.
-- **Researcher:** Integrated the latest information about AutoGen v0.4, ensuring the prompt and generated code reflect current capabilities and best practices.
-### SYSTEM GUARDRAILS ###
-- If unsure about the user's request, ask clarifying questions rather than making assumptions.
-- Do not fabricate data or features not supported by AutoGen v0.4.
-- Ensure the code is scalable, modular, and adheres to best practices.
-### START ###
-'''
+                "content": "### MEMORY ###\nRecall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence."
             },
             {
                 "role": "user",
                 "content": prompt
             }
         ]
-
+        start_api = time.time()
         completion = hf_client.chat.completions.create(
             model=MODEL_NAME,
             messages=messages,
             max_tokens=500
         )
+        end_api = time.time()
+        print(f"[TIMING] Hugging Face API call took {end_api - start_api:.2f} seconds")
         response = completion.choices[0].message.content
-        print(f"[LOG] Using Hugging Face model (serverless): {MODEL_NAME}")
-        print(f"[LOG] Hugging Face response: {response[:200]}...")
-
     elif model_provider.lower() == "openai":
+        start_api = time.time()
         response = client.chat.completions.create(
             model=os.environ.get("OPENAI_MODEL"),
             messages=[
                 {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
                 {"role": "user", "content": prompt},
             ]
-        )
-
-        print(f"[
-
-
-    # Update chat history with new message pair
+        ).choices[0].message.content
+        end_api = time.time()
+        print(f"[TIMING] OpenAI API call took {end_api - start_api:.2f} seconds")
+    elapsed_time = time.time() - start_time
+    print(f"[TIMING] _generate_response_gpu took {elapsed_time:.2f} seconds")
     history.append((question, response))
     return history

@@ -205,38 +183,38 @@ Provide the AutoGen v0.4 agent code that fulfills the user's request. Utilize fe
 def _generate_response_cpu(question, history):
     print(f"[LOG] Running on CPU")
     try:
+        start_time = time.time()
         relevant_docs = get_relevant_documents(question, k=3)
         context = "\n".join(relevant_docs)
         prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
         print(f"[LOG] Generated prompt: {prompt[:200]}...")
-
         if model_provider.lower() == "huggingface":
-            # Use CPU version of the model
             messages = [
-                {
-                    "role": "system",
-                    "content": '''### MEMORY ###\nRecall all previously provided instructions, context, and data throughout this conversation to ensure consistency and coherence. Use the details from the last interaction to guide your response.
-### SYSTEM GUARDRAILS ###'''
-                },
+                {"role": "system", "content": "### MEMORY ###\nRecall all previously provided instructions, context, and data."},
                 {"role": "user", "content": prompt}
             ]
-
+            start_api = time.time()
            completion = hf_client.chat.completions.create(
                 model=MODEL_NAME,
                 messages=messages,
                 max_tokens=500
             )
+            end_api = time.time()
+            print(f"[TIMING] Hugging Face API call took {end_api - start_api:.2f} seconds")
             response = completion.choices[0].message.content
         elif model_provider.lower() == "openai":
+            start_api = time.time()
             response = client.chat.completions.create(
                 model=os.environ.get("OPENAI_MODEL"),
                 messages=[
-                    {"role": "system", "content": "You are a helpful assistant. Answer the question based on the provided context."},
+                    {"role": "system", "content": "You are a helpful assistant."},
                     {"role": "user", "content": prompt},
                 ]
-            )
-
-
+            ).choices[0].message.content
+            end_api = time.time()
+            print(f"[TIMING] OpenAI API call took {end_api - start_api:.2f} seconds")
+        elapsed_time = time.time() - start_time
+        print(f"[TIMING] _generate_response_cpu took {elapsed_time:.2f} seconds")
         history.append((question, response))
         return history
     except Exception as e:
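The commit applies the same start/end/print timing pattern around many blocks. Below is a minimal sketch, not part of the commit, of how that repeated pattern could be collected into a single helper; the name `timed` is hypothetical, and only the [TIMING] log format is taken from the diff above.

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Print one [TIMING] line for the wrapped block, in the same format as the logs added above.
    start = time.time()
    try:
        yield
    finally:
        print(f"[TIMING] {label} took {time.time() - start:.2f} seconds")

# Hypothetical usage with the dataset-loading step from app.py:
# with timed("Dataset loading"):
#     dataset = load_dataset('tosin2013/autogen', streaming=True)
#     dataset = Dataset.from_list(list(dataset['train']))

Keeping the label and the timing in one place avoids the start/end pairs drifting apart as more blocks are instrumented.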