simondh committed
Commit 36183d4 · 1 Parent(s): 720c911

classify async

Files changed (4):
  1. app.py +37 -17
  2. classifiers/llm.py +62 -74
  3. client.py +37 -0
  4. process.py +16 -27
app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import gradio as gr
+import asyncio
 
-from litellm import OpenAI
 import json
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
@@ -9,7 +9,9 @@ import matplotlib.pyplot as plt
 
 import logging
 from dotenv import load_dotenv
-from process import update_api_key, process_file, export_results
+from process import update_api_key, process_file_async, export_results
+from client import get_client, initialize_client
+
 # Load environment variables from .env file
 load_dotenv()
 
@@ -30,16 +32,13 @@ logging.basicConfig(
 # Initialize API key from environment variable
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
 
-# Only initialize client if API key is available
-client = None
+# Initialize client if API key is available
 if OPENAI_API_KEY:
-    try:
-        client = OpenAI(api_key=OPENAI_API_KEY)
+    success, message = initialize_client(OPENAI_API_KEY)
+    if success:
         logging.info("OpenAI client initialized successfully")
-    except Exception as e:
-        logging.error(f"Failed to initialize OpenAI client: {str(e)}")
-
-
+    else:
+        logging.error(f"Failed to initialize OpenAI client: {message}")
 
 # Create Gradio interface
 with gr.Blocks(title="Text Classification System") as demo:
@@ -57,9 +56,8 @@ with gr.Blocks(title="Text Classification System") as demo:
         api_key_message = gr.Textbox(label="Status", interactive=False)
 
         # Display current API status
-        api_status = (
-            "API Key is set" if OPENAI_API_KEY else "No API Key found. Please set one."
-        )
+        client = get_client()
+        api_status = "API Key is set" if client else "No API Key found. Please set one."
         gr.Markdown(f"**Current API Status**: {api_status}")
 
         api_key_button.click(
@@ -344,7 +342,7 @@ with gr.Blocks(title="Text Classification System") as demo:
         return gr.File(value=file_path, visible=True)
 
     # Function to improve classification based on validation report
-    def improve_classification(
+    async def improve_classification_async(
        df,
        validation_report,
        text_columns,
@@ -353,7 +351,7 @@ with gr.Blocks(title="Text Classification System") as demo:
        show_explanations,
        file,
    ):
-        """Improve classification based on validation report"""
+        """Async version of improve_classification"""
        if df is None or not validation_report:
            return (
                df,
@@ -420,7 +418,7 @@ with gr.Blocks(title="Text Classification System") as demo:
        categories = ",".join(all_categories)
 
        # Process with improved parameters
-        improved_df, new_validation = process_file(
+        improved_df, new_validation = await process_file_async(
            file,
            text_columns,
            categories,
@@ -466,6 +464,28 @@ with gr.Blocks(title="Text Classification System") as demo:
            ),
        )
 
+    def improve_classification(
+        df,
+        validation_report,
+        text_columns,
+        categories,
+        classifier_type,
+        show_explanations,
+        file,
+    ):
+        """Synchronous wrapper for improve_classification_async"""
+        return asyncio.run(
+            improve_classification_async(
+                df,
+                validation_report,
+                text_columns,
+                categories,
+                classifier_type,
+                show_explanations,
+                file,
+            )
+        )
+
     # Connect functions
     load_categories_button.click(
        load_file_and_suggest_categories,
@@ -506,7 +526,7 @@ with gr.Blocks(title="Text Classification System") as demo:
     process_button.click(
        lambda: gr.Dataframe(visible=True), inputs=[], outputs=[results_df]
    ).then(
-        process_file,
+        process_file_async,
        inputs=[
            file_input,
            text_column,
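
The app.py side of the change leans on Gradio's support for coroutine event handlers: `process_file_async` is wired straight into `.then()`, while `improve_classification` keeps a synchronous entry point by handing the coroutine to `asyncio.run`. A minimal sketch of the async-handler pattern, with illustrative component and function names (not the ones in app.py):

import asyncio
import gradio as gr

async def slow_echo(text):
    # Stands in for the awaited LLM calls inside process_file_async
    await asyncio.sleep(1)
    return f"processed: {text}"

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    run = gr.Button("Run")
    # Gradio accepts a coroutine function directly; no asyncio.run needed here
    run.click(slow_echo, inputs=[box], outputs=[out])

if __name__ == "__main__":
    demo.launch()

The `asyncio.run` wrapper is only needed where the caller is plain synchronous code; it starts its own event loop, so it should not be invoked from inside an already running loop.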
classifiers/llm.py CHANGED
@@ -1,4 +1,3 @@
-
 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -6,14 +5,13 @@ from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import cosine_similarity
 import random
 import json
-from concurrent.futures import ThreadPoolExecutor, as_completed
+import asyncio
 from typing import List, Dict, Any, Optional
 from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
 
 from .base import BaseClassifier
 
 
-
 class LLMClassifier(BaseClassifier):
     """Classifier using a Large Language Model for more accurate but slower classification"""
 
@@ -22,77 +20,15 @@ class LLMClassifier(BaseClassifier):
         self.client = client
         self.model = model
 
-    def classify(
-        self, texts: List[str], categories: Optional[List[str]] = None
-    ) -> List[Dict[str, Any]]:
-        """Classify texts using an LLM with parallel processing"""
-        if not categories:
-            # First, use LLM to generate appropriate categories
-            categories = self._suggest_categories(texts)
-
-        # Process texts in parallel
-        with ThreadPoolExecutor(max_workers=10) as executor:
-            # Submit all tasks with their original indices
-            future_to_index = {
-                executor.submit(self._classify_text, text, categories): idx
-                for idx, text in enumerate(texts)
-            }
-
-            # Initialize results list with None values
-            results = [None] * len(texts)
-
-            # Collect results as they complete
-            for future in as_completed(future_to_index):
-                original_idx = future_to_index[future]
-                try:
-                    result = future.result()
-                    results[original_idx] = result
-                except Exception as e:
-                    print(f"Error processing text: {str(e)}")
-                    results[original_idx] = {
-                        "category": categories[0],
-                        "confidence": 50,
-                        "explanation": f"Error during classification: {str(e)}",
-                    }
-
-        return results
-
-    def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
-        """Use LLM to suggest appropriate categories for the dataset"""
-        # Take a sample of texts to avoid token limitations
-        if len(texts) > sample_size:
-            sample_texts = random.sample(texts, sample_size)
-        else:
-            sample_texts = texts
-
-        prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
-
-        try:
-            response = self.client.chat.completions.create(
-                model=self.model,
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0.2,
-                max_tokens=100,
-            )
-
-            # Parse response to get categories
-            categories_text = response.choices[0].message.content.strip()
-            categories = [cat.strip() for cat in categories_text.split(",")]
-
-            return categories
-        except Exception as e:
-            # Fallback to default categories on error
-            print(f"Error suggesting categories: {str(e)}")
-            return self._generate_default_categories(texts)
-
-    def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
-        """Use LLM to classify a single text"""
+    async def _classify_text_async(self, text: str, categories: List[str]) -> Dict[str, Any]:
+        """Async version of text classification"""
         prompt = TEXT_CLASSIFICATION_PROMPT.format(
-            categories=", ".join(categories), text=text
+            categories=", ".join(categories),
+            text=text
         )
 
         try:
-            response = self.client.chat.completions.create(
+            response = await self.client.chat.completions.create(
                 model=self.model,
                 messages=[{"role": "user", "content": prompt}],
                 temperature=0,
@@ -101,17 +37,15 @@ class LLMClassifier(BaseClassifier):
 
             # Parse JSON response
             response_text = response.choices[0].message.content.strip()
-
             result = json.loads(response_text)
+
             # Ensure all required fields are present
             if not all(k in result for k in ["category", "confidence", "explanation"]):
                 raise ValueError("Missing required fields in LLM response")
 
             # Validate category is in the list
             if result["category"] not in categories:
-                result["category"] = categories[
-                    0
-                ] # Default to first category if invalid
+                result["category"] = categories[0] # Default to first category if invalid
 
             # Validate confidence is a number between 0 and 100
             try:
@@ -135,3 +69,57 @@ class LLMClassifier(BaseClassifier):
                 "confidence": 50,
                 "explanation": f"Classification based on language model analysis. (Note: Structured response parsing failed)",
             }
+        except Exception as e:
+            return {
+                "category": categories[0],
+                "confidence": 50,
+                "explanation": f"Error during classification: {str(e)}",
+            }
+
+    async def _suggest_categories_async(self, texts: List[str], sample_size: int = 20) -> List[str]:
+        """Async version of category suggestion"""
+        # Take a sample of texts to avoid token limitations
+        if len(texts) > sample_size:
+            sample_texts = random.sample(texts, sample_size)
+        else:
+            sample_texts = texts
+
+        prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
+
+        try:
+            response = await self.client.chat.completions.create(
+                model=self.model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.2,
+                max_tokens=100,
+            )
+
+            # Parse response to get categories
+            categories_text = response.choices[0].message.content.strip()
+            categories = [cat.strip() for cat in categories_text.split(",")]
+
+            return categories
+        except Exception as e:
+            # Fallback to default categories on error
+            print(f"Error suggesting categories: {str(e)}")
+            return self._generate_default_categories(texts)
+
+    async def classify_async(
+        self, texts: List[str], categories: Optional[List[str]] = None
+    ) -> List[Dict[str, Any]]:
+        """Async method to classify texts"""
+        if not categories:
+            categories = await self._suggest_categories_async(texts)
+
+        # Create tasks for all texts
+        tasks = [self._classify_text_async(text, categories) for text in texts]
+
+        # Gather all results
+        results = await asyncio.gather(*tasks)
+        return results
+
+    def classify(
+        self, texts: List[str], categories: Optional[List[str]] = None
+    ) -> List[Dict[str, Any]]:
+        """Synchronous wrapper for backwards compatibility"""
+        return asyncio.run(self.classify_async(texts, categories))
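
The classifier swaps the `ThreadPoolExecutor`/`as_completed` fan-out for `asyncio.gather`, which also removes the index bookkeeping because `gather` returns results in the order the coroutines were passed in; awaiting `self.client.chat.completions.create(...)` assumes the injected client exposes awaitable methods. A rough, self-contained sketch of the same fan-out, with a stubbed-in call standing in for the real LLM client:

import asyncio
from typing import Any, Dict, List

async def classify_one(text: str, categories: List[str]) -> Dict[str, Any]:
    # Stub for one awaited LLM call; real code would await the client here
    try:
        await asyncio.sleep(0.05)  # simulated network latency
        return {"category": categories[0], "confidence": 90, "explanation": text}
    except Exception as e:
        # Per-task fallback, mirroring the error branch in _classify_text_async
        return {"category": categories[0], "confidence": 50, "explanation": str(e)}

async def classify_all(texts: List[str], categories: List[str]) -> List[Dict[str, Any]]:
    tasks = [classify_one(t, categories) for t in texts]
    return await asyncio.gather(*tasks)  # results come back in input order

print(asyncio.run(classify_all(["a", "b", "c"], ["news", "spam"])))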
client.py ADDED
@@ -0,0 +1,37 @@
+from litellm import OpenAI
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Initialize client as None
+client = None
+
+def get_client():
+    """Get the OpenAI client instance"""
+    global client
+    return client
+
+def initialize_client(api_key=None):
+    """Initialize the OpenAI client with an API key"""
+    global client
+
+    # Use provided API key or get from environment
+    api_key = api_key or os.environ.get("OPENAI_API_KEY")
+
+    if not api_key:
+        return False, "No API key provided"
+
+    try:
+        client = OpenAI(api_key=api_key)
+        # Test the connection with a simple request
+        response = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "test"}],
+            max_tokens=5,
+        )
+        return True, "API Key updated and verified successfully"
+    except Exception as e:
+        client = None
+        return False, f"Failed to initialize client: {str(e)}"
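
client.py centralizes the client in a module-level singleton: `initialize_client` builds and smoke-tests it, and `get_client` hands it out to app.py and process.py. A hypothetical caller, assuming only what the diff shows (the `(bool, str)` return tuple and the `LLMClassifier(client=..., model=...)` constructor):

from client import initialize_client, get_client
from classifiers import LLMClassifier

# Falls back to OPENAI_API_KEY from the environment when no key is passed
ok, message = initialize_client()
print(message)

client = get_client()
if client is not None:
    classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")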
process.py CHANGED
@@ -1,41 +1,22 @@
-
-
 import logging
 import time
 import traceback
+import asyncio
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-from litellm import OpenAI
 from classifiers import TFIDFClassifier, LLMClassifier
 from utils import load_data, validate_results
+from client import get_client
 
 
 def update_api_key(api_key):
     """Update the OpenAI API key"""
-    global OPENAI_API_KEY, client
-
-    if not api_key:
-        return "API Key cannot be empty"
-
-    OPENAI_API_KEY = api_key
-
-    try:
-        client = OpenAI(api_key=api_key)
-        # Test the connection with a simple request
-        response = client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[{"role": "user", "content": "test"}],
-            max_tokens=5,
-        )
-        return f"API Key updated and verified successfully"
-    except Exception as e:
-        error_msg = str(e)
-        logging.error(f"API key update failed: {error_msg}")
-        return f"Failed to update API Key: {error_msg}"
+    from client import initialize_client
+    return initialize_client(api_key)
 
 
-def process_file(file, text_columns, categories, classifier_type, show_explanations):
-    """Process the uploaded file and classify text data"""
+async def process_file_async(file, text_columns, categories, classifier_type, show_explanations):
+    """Async version of process_file"""
     # Initialize result_df and validation_report
     result_df = None
     validation_report = None
@@ -83,6 +64,9 @@ def process_file(file, text_columns, categories, classifier_type, show_explanations):
     else:
         classifier_type = "tfidf"
 
+    # Get the client instance
+    client = get_client()
+
     # Initialize appropriate classifier
     if classifier_type == "tfidf":
         classifier = TFIDFClassifier()
@@ -95,7 +79,7 @@ def process_file(file, text_columns, categories, classifier_type, show_explanations):
         )
         model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
         classifier = LLMClassifier(client=client, model=model)
-        results = classifier.classify(texts, category_list)
+        results = await classifier.classify_async(texts, category_list)
     else: # hybrid
         if client is None:
             return (
@@ -121,7 +105,7 @@ def process_file(file, text_columns, categories, classifier_type, show_explanations):
             results.append(tfidf_result)
 
         if low_confidence_texts:
-            llm_results = llm_classifier.classify(
+            llm_results = await llm_classifier.classify_async(
                 low_confidence_texts, category_list
             )
             for idx, llm_result in zip(low_confidence_indices, llm_results):
@@ -145,6 +129,11 @@ def process_file(file, text_columns, categories, classifier_type, show_explanations):
         return None, f"Error: {str(e)}\n{error_traceback}"
 
 
+def process_file(file, text_columns, categories, classifier_type, show_explanations):
+    """Synchronous wrapper for process_file_async"""
+    return asyncio.run(process_file_async(file, text_columns, categories, classifier_type, show_explanations))
+
+
 def export_results(df, format_type):
     """Export results to a file and return the file path for download"""
     if df is None:
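
process.py ends up with two entry points: the coroutine `process_file_async` and a thin `process_file` wrapper built on `asyncio.run` for callers that are not already inside an event loop. A sketch of both call styles; the file path, column, and category values below are made up for the example, and how `file` is loaded ultimately depends on `load_data`:

import asyncio
from process import process_file, process_file_async

# Plain synchronous call: the wrapper spins up its own event loop
df, report = process_file("reviews.csv", ["text"], "news,spam,other", "gpt35", True)

# From async code, await the coroutine directly instead of using the wrapper
async def main():
    return await process_file_async("reviews.csv", ["text"], "news,spam,other", "gpt35", True)

df, report = asyncio.run(main())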