Spaces:

simondh
/

classifieur

Sleeping

App Files Files Community

simondh committed on 27 days ago

Commit

a241f5a

1 Parent(s): ffbfa54

isolate prompts

Browse files

Files changed (4) hide show

README.md +5 -19
app.py +20 -46
classifiers.py +6 -22
prompts.py +63 -0

README.md CHANGED Viewed

@@ -60,33 +60,19 @@ brainbox4/
 ```
 ## 🔧 Optimisations de Performance
-### Traitement Parallèle
-- Exploitation d'`asyncio` pour effectuer des appels API simultanés.
-- Gestion par lots de 20 textes par requête pour optimiser le débit.
-### Sélection Intelligente du Modèle
-- **GPT-3.5** : Utilisé par défaut pour moins de 100 textes.
-- **GPT-3.5-16k** : Adapté pour des volumes de 100 à 500 textes.
-- **GPT-4** : Préféré pour plus de 500 textes.
-- Intégration future de modèles hébergés localement pour une flexibilité accrue.
 ## 🎨 Optimisations de l'Interface Utilisateur
-### Suggestions Automatiques
-- Propositions automatiques de catégories et de colonnes basées sur un échantillon de textes.
-### Évaluation et Reclassification
 - Rapport d'évaluation détaillé après classification : analyse des catégories, détection des incohérences, suggestions d'amélioration.
-- Proposition de reclassification des textes selon les recommandations du rapport, ajustement des catégories et seuils de confiance pour améliorer la précision.
 ## ✨ Fonctionnalités Principales
 1. **Classification Rapide**
    - Traitement parallèle des textes
    - Support des fichiers Excel/CSV
-   - Scores de confiance
 2. **Interface Simple**
    - Upload de fichiers

 ```
 ## 🔧 Optimisations de Performance
+- Parallélisation des requêtes API par lots de 10 au maximum pour accélérer la classification.
+- Suggestion automatique du modèle.
 ## 🎨 Optimisations de l'Interface Utilisateur
+- Suggestions automatiques de catégories et de colonnes basées sur un échantillon de textes.
 - Rapport d'évaluation détaillé après classification : analyse des catégories, détection des incohérences, suggestions d'amélioration.
+- Suggestion de reclassification des textes selon les recommandations du rapport.
 ## ✨ Fonctionnalités Principales
 1. **Classification Rapide**
    - Traitement parallèle des textes
    - Support des fichiers Excel/CSV
+   - Scores de confiance et justification
 2. **Interface Simple**
    - Upload de fichiers

app.py CHANGED Viewed

@@ -12,11 +12,16 @@ import time
 import torch
 import traceback
 import logging
-import asyncio
 # Import local modules
 from classifiers import TFIDFClassifier, LLMClassifier
 from utils import load_data, export_data, visualize_results, validate_results
 # Configure logging
 logging.basicConfig(level=logging.INFO,
@@ -269,12 +274,8 @@ with gr.Blocks(title="Text Classification System") as demo:
                     process_button = gr.Button("Process and Classify", visible=False)
         results_df = gr.Dataframe(interactive=True, visible=False)
         # Create containers for visualization and validation report
         with gr.Row(visible=False) as results_row:
             with gr.Column():
@@ -286,7 +287,6 @@ with gr.Blocks(title="Text Classification System") as demo:
                 validation_output = gr.Textbox(label="Validation Report", interactive=False)
                 improve_button = gr.Button("Improve Classification with Report", visible=False)
         # Function to load file and suggest categories
         def load_file_and_suggest_categories(file):
             if not file:
@@ -319,13 +319,7 @@ with gr.Blocks(title="Text Classification System") as demo:
                 # Use LLM to suggest categories
                 if client:
-                    prompt = f"""
-                    Based on these example texts, suggest 5 appropriate categories for classification:
-                    {sample_texts[:5]}
-                    Return your answer as a comma-separated list of category names only.
-                    """
                     try:
                         response = client.chat.completions.create(
                             model="gpt-3.5-turbo",
@@ -396,15 +390,10 @@ with gr.Blocks(title="Text Classification System") as demo:
                     sample_texts.extend(df[col].head(5).tolist())
                 if client:
-                    prompt = f"""
-                    Based on these example texts and the existing categories ({', '.join(current_categories)}),
-                    suggest one additional appropriate category for classification.
-                    Example texts:
-                    {sample_texts[:5]}
-                    Return only the suggested category name, nothing else.
-                    """
                     try:
                         response = client.chat.completions.create(
                             model="gpt-3.5-turbo",
@@ -438,20 +427,10 @@ with gr.Blocks(title="Text Classification System") as demo:
             try:
                 # Extract insights from validation report
                 if client:
-                    prompt = f"""
-                    Based on this validation report, analyze the current classification and suggest improvements:
-                    {validation_report}
-                    Return your answer in JSON format with these fields:
-                    - suggested_categories: list of improved category names (must be different from current categories: {categories})
-                    - confidence_threshold: a number between 0 and 100 for minimum confidence
-                    - focus_areas: list of specific aspects to focus on during classification
-                    - analysis: a brief analysis of what needs improvement
-                    - new_categories_needed: boolean indicating if new categories should be added
-                    JSON response:
-                    """
                     try:
                         response = client.chat.completions.create(
                             model="gpt-4",
@@ -475,16 +454,11 @@ with gr.Blocks(title="Text Classification System") as demo:
                                     temp_df = load_data(file.name)
                                 sample_texts.extend(temp_df[col].head(5).tolist())
-                            category_prompt = f"""
-                            Based on these example texts and the current categories ({', '.join(current_categories)}),
-                            suggest new categories that would improve the classification. The validation report indicates:
-                            {improvements.get('analysis', '')}
-                            Example texts:
-                            {sample_texts[:5]}
-                            Return your answer as a comma-separated list of new category names only.
-                            """
                             category_response = client.chat.completions.create(
                                 model="gpt-4",

 import torch
 import traceback
 import logging
 # Import local modules
 from classifiers import TFIDFClassifier, LLMClassifier
 from utils import load_data, export_data, visualize_results, validate_results
+from prompts import (
+    CATEGORY_SUGGESTION_PROMPT,
+    ADDITIONAL_CATEGORY_PROMPT,
+    VALIDATION_ANALYSIS_PROMPT,
+    CATEGORY_IMPROVEMENT_PROMPT
+)
 # Configure logging
 logging.basicConfig(level=logging.INFO,
                     process_button = gr.Button("Process and Classify", visible=False)
         results_df = gr.Dataframe(interactive=True, visible=False)
         # Create containers for visualization and validation report
         with gr.Row(visible=False) as results_row:
             with gr.Column():
                 validation_output = gr.Textbox(label="Validation Report", interactive=False)
                 improve_button = gr.Button("Improve Classification with Report", visible=False)
         # Function to load file and suggest categories
         def load_file_and_suggest_categories(file):
             if not file:
                 # Use LLM to suggest categories
                 if client:
+                    prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts[:5]))
                     try:
                         response = client.chat.completions.create(
                             model="gpt-3.5-turbo",
                     sample_texts.extend(df[col].head(5).tolist())
                 if client:
+                    prompt = ADDITIONAL_CATEGORY_PROMPT.format(
+                        existing_categories=", ".join(current_categories),
+                        sample_texts="\n---\n".join(sample_texts[:5])
+                    )
                     try:
                         response = client.chat.completions.create(
                             model="gpt-3.5-turbo",
             try:
                 # Extract insights from validation report
                 if client:
+                    prompt = VALIDATION_ANALYSIS_PROMPT.format(
+                        validation_report=validation_report,
+                        current_categories=categories
+                    )
                     try:
                         response = client.chat.completions.create(
                             model="gpt-4",
                                     temp_df = load_data(file.name)
                                 sample_texts.extend(temp_df[col].head(5).tolist())
+                            category_prompt = CATEGORY_IMPROVEMENT_PROMPT.format(
+                                current_categories=", ".join(current_categories),
+                                analysis=improvements.get('analysis', ''),
+                                sample_texts="\n---\n".join(sample_texts[:5])
+                            )
                             category_response = client.chat.completions.create(
                                 model="gpt-4",

classifiers.py CHANGED Viewed

@@ -7,6 +7,7 @@ import random
 import json
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Dict, Any, Optional
 class BaseClassifier:
     """Base class for text classifiers"""
@@ -183,14 +184,7 @@ class LLMClassifier(BaseClassifier):
         else:
             sample_texts = texts
-        prompt = """
-        I have a collection of texts that I need to classify into categories. Here are some examples:
-        {}
-        Based on these examples, suggest up 2 to 5 appropriate categories for classification.
-        Return your answer as a comma-separated list of category names only.
-        """.format("\n---\n".join(sample_texts))
         try:
             response = self.client.chat.completions.create(
@@ -212,20 +206,10 @@ class LLMClassifier(BaseClassifier):
     def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
         """Use LLM to classify a single text"""
-        categories_str = ", ".join(categories)
-        prompt = f"""
-        Classify the following text into one of these categories: {categories_str}
-        Text: {text}
-        Return your answer in JSON format with these fields:
-        - category: the chosen category from the list
-        - confidence: a value between 0 and 100 indicating your confidence in this classification (as a percentage)
-        - explanation: a brief explanation of why this category was chosen (1-2 sentences)
-        JSON response:
-        """
         try:
             response = self.client.chat.completions.create(

 import json
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Dict, Any, Optional
+from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
 class BaseClassifier:
     """Base class for text classifiers"""
         else:
             sample_texts = texts
+        prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
         try:
             response = self.client.chat.completions.create(
     def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
         """Use LLM to classify a single text"""
+        prompt = TEXT_CLASSIFICATION_PROMPT.format(
+            categories=", ".join(categories),
+            text=text
+        )
         try:
             response = self.client.chat.completions.create(

prompts.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""Prompts used in the text classification system"""
+# Category suggestion prompt.
+# Has ONE anonymous positional field ({}): the sample texts, already joined into a single string.
+# Fill it positionally, i.e. CATEGORY_SUGGESTION_PROMPT.format(joined_texts); a keyword argument will not fill "{}".
+CATEGORY_SUGGESTION_PROMPT = """
+Based on these example texts, suggest 5 appropriate categories for classification:
+{}
+Return your answer as a comma-separated list of category names only.
+"""
+# Text classification prompt.
+# Named fields: {categories} (the candidate categories, as one string) and {text} (the text to classify).
+# Asks the model for a JSON answer with the fields category, confidence (0-100) and explanation.
+TEXT_CLASSIFICATION_PROMPT = """
+Classify the following text into one of these categories: {categories}
+Text: {text}
+Return your answer in JSON format with these fields:
+- category: the chosen category from the list
+- confidence: a value between 0 and 100 indicating your confidence in this classification (as a percentage)
+- explanation: a brief explanation of why this category was chosen (1-2 sentences)
+JSON response:
+"""
+# Additional category suggestion prompt.
+# Named fields: {existing_categories} (categories already in use) and {sample_texts} (example texts, pre-joined).
+# The sample field is named rather than anonymous ("{}") because the template is filled with keyword
+# arguments only; an anonymous field would make str.format raise IndexError (no positional argument).
+ADDITIONAL_CATEGORY_PROMPT = """
+Based on these example texts and the existing categories ({existing_categories}),
+suggest one additional appropriate category for classification.
+Example texts:
+{sample_texts}
+Return only the suggested category name, nothing else.
+"""
+# Validation report analysis prompt.
+# Named fields: {validation_report} (the report text to analyze) and {current_categories}
+# (the categories the model is told its suggestions must differ from).
+# Asks the model for a JSON answer with the fields suggested_categories, confidence_threshold,
+# focus_areas, analysis and new_categories_needed.
+VALIDATION_ANALYSIS_PROMPT = """
+Based on this validation report, analyze the current classification and suggest improvements:
+{validation_report}
+Return your answer in JSON format with these fields:
+- suggested_categories: list of improved category names (must be different from current categories: {current_categories})
+- confidence_threshold: a number between 0 and 100 for minimum confidence
+- focus_areas: list of specific aspects to focus on during classification
+- analysis: a brief analysis of what needs improvement
+- new_categories_needed: boolean indicating if new categories should be added
+JSON response:
+"""
+# Category improvement prompt.
+# Named fields: {current_categories} (categories in use), {analysis} (findings taken from the
+# validation report) and {sample_texts} (example texts, pre-joined).
+# The sample field is named rather than anonymous ("{}") because the template is filled with keyword
+# arguments only; an anonymous field would make str.format raise IndexError (no positional argument).
+CATEGORY_IMPROVEMENT_PROMPT = """
+Based on these example texts and the current categories ({current_categories}),
+suggest new categories that would improve the classification. The validation report indicates:
+{analysis}
+Example texts:
+{sample_texts}
+Return your answer as a comma-separated list of new category names only.
+"""