Spaces:
Sleeping
Sleeping
isolate prompts
Browse files- README.md +5 -19
- app.py +20 -46
- classifiers.py +6 -22
- prompts.py +63 -0
README.md
CHANGED
@@ -60,33 +60,19 @@ brainbox4/
|
|
60 |
```
|
61 |
|
62 |
## 🔧 Optimisations de Performance
|
63 |
-
|
64 |
-
|
65 |
-
- Exploitation d'`asyncio` pour effectuer des appels API simultanés.
|
66 |
-
- Gestion par lots de 20 textes par requête pour optimiser le débit.
|
67 |
-
|
68 |
-
### Sélection Intelligente du Modèle
|
69 |
-
- **GPT-3.5** : Utilisé par défaut pour moins de 100 textes.
|
70 |
-
- **GPT-3.5-16k** : Adapté pour des volumes de 100 à 500 textes.
|
71 |
-
- **GPT-4** : Préféré pour plus de 500 textes.
|
72 |
-
- Intégration future de modèles hébergés localement pour une flexibilité accrue.
|
73 |
|
74 |
## 🎨 Optimisations de l'Interface Utilisateur
|
75 |
-
|
76 |
-
### Suggestions Automatiques
|
77 |
-
- Propositions automatiques de catégories et de colonnes basées sur un échantillon de textes.
|
78 |
-
|
79 |
-
### Évaluation et Reclassification
|
80 |
- Rapport d'évaluation détaillé après classification : analyse des catégories, détection des incohérences, suggestions d'amélioration.
|
81 |
-
-
|
82 |
-
|
83 |
|
84 |
## ✨ Fonctionnalités Principales
|
85 |
-
|
86 |
1. **Classification Rapide**
|
87 |
- Traitement parallèle des textes
|
88 |
- Support des fichiers Excel/CSV
|
89 |
-
- Scores de confiance
|
90 |
|
91 |
2. **Interface Simple**
|
92 |
- Upload de fichiers
|
|
|
60 |
```
|
61 |
|
62 |
## 🔧 Optimisations de Performance
|
63 |
+
- Parallélisation des requêtes API par lot de 10 maximum pour accélérer la classification.
|
64 |
+
- Suggestion automatique du modèle.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
## 🎨 Optimisations de l'Interface Utilisateur
|
67 |
+
- Suggestions automatiques de catégories et de colonnes basées sur un échantillon de textes.
|
|
|
|
|
|
|
|
|
68 |
- Rapport d'évaluation détaillé après classification : analyse des catégories, détection des incohérences, suggestions d'amélioration.
|
69 |
+
- Suggestion de reclassification des textes selon les recommandations du rapport.
|
|
|
70 |
|
71 |
## ✨ Fonctionnalités Principales
|
|
|
72 |
1. **Classification Rapide**
|
73 |
- Traitement parallèle des textes
|
74 |
- Support des fichiers Excel/CSV
|
75 |
+
- Scores de confiance et justification
|
76 |
|
77 |
2. **Interface Simple**
|
78 |
- Upload de fichiers
|
app.py
CHANGED
@@ -12,11 +12,16 @@ import time
|
|
12 |
import torch
|
13 |
import traceback
|
14 |
import logging
|
15 |
-
import asyncio
|
16 |
|
17 |
# Import local modules
|
18 |
from classifiers import TFIDFClassifier, LLMClassifier
|
19 |
from utils import load_data, export_data, visualize_results, validate_results
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# Configure logging
|
22 |
logging.basicConfig(level=logging.INFO,
|
@@ -269,12 +274,8 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
269 |
|
270 |
process_button = gr.Button("Process and Classify", visible=False)
|
271 |
|
272 |
-
|
273 |
-
|
274 |
results_df = gr.Dataframe(interactive=True, visible=False)
|
275 |
|
276 |
-
|
277 |
-
|
278 |
# Create containers for visualization and validation report
|
279 |
with gr.Row(visible=False) as results_row:
|
280 |
with gr.Column():
|
@@ -286,7 +287,6 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
286 |
validation_output = gr.Textbox(label="Validation Report", interactive=False)
|
287 |
improve_button = gr.Button("Improve Classification with Report", visible=False)
|
288 |
|
289 |
-
|
290 |
# Function to load file and suggest categories
|
291 |
def load_file_and_suggest_categories(file):
|
292 |
if not file:
|
@@ -319,13 +319,7 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
319 |
|
320 |
# Use LLM to suggest categories
|
321 |
if client:
|
322 |
-
prompt =
|
323 |
-
Based on these example texts, suggest 5 appropriate categories for classification:
|
324 |
-
|
325 |
-
{sample_texts[:5]}
|
326 |
-
|
327 |
-
Return your answer as a comma-separated list of category names only.
|
328 |
-
"""
|
329 |
try:
|
330 |
response = client.chat.completions.create(
|
331 |
model="gpt-3.5-turbo",
|
@@ -396,15 +390,10 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
396 |
sample_texts.extend(df[col].head(5).tolist())
|
397 |
|
398 |
if client:
|
399 |
-
prompt =
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
Example texts:
|
404 |
-
{sample_texts[:5]}
|
405 |
-
|
406 |
-
Return only the suggested category name, nothing else.
|
407 |
-
"""
|
408 |
try:
|
409 |
response = client.chat.completions.create(
|
410 |
model="gpt-3.5-turbo",
|
@@ -438,20 +427,10 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
438 |
try:
|
439 |
# Extract insights from validation report
|
440 |
if client:
|
441 |
-
prompt =
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
Return your answer in JSON format with these fields:
|
447 |
-
- suggested_categories: list of improved category names (must be different from current categories: {categories})
|
448 |
-
- confidence_threshold: a number between 0 and 100 for minimum confidence
|
449 |
-
- focus_areas: list of specific aspects to focus on during classification
|
450 |
-
- analysis: a brief analysis of what needs improvement
|
451 |
-
- new_categories_needed: boolean indicating if new categories should be added
|
452 |
-
|
453 |
-
JSON response:
|
454 |
-
"""
|
455 |
try:
|
456 |
response = client.chat.completions.create(
|
457 |
model="gpt-4",
|
@@ -475,16 +454,11 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
475 |
temp_df = load_data(file.name)
|
476 |
sample_texts.extend(temp_df[col].head(5).tolist())
|
477 |
|
478 |
-
category_prompt =
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
Example texts:
|
484 |
-
{sample_texts[:5]}
|
485 |
-
|
486 |
-
Return your answer as a comma-separated list of new category names only.
|
487 |
-
"""
|
488 |
|
489 |
category_response = client.chat.completions.create(
|
490 |
model="gpt-4",
|
|
|
12 |
import torch
|
13 |
import traceback
|
14 |
import logging
|
|
|
15 |
|
16 |
# Import local modules
|
17 |
from classifiers import TFIDFClassifier, LLMClassifier
|
18 |
from utils import load_data, export_data, visualize_results, validate_results
|
19 |
+
from prompts import (
|
20 |
+
CATEGORY_SUGGESTION_PROMPT,
|
21 |
+
ADDITIONAL_CATEGORY_PROMPT,
|
22 |
+
VALIDATION_ANALYSIS_PROMPT,
|
23 |
+
CATEGORY_IMPROVEMENT_PROMPT
|
24 |
+
)
|
25 |
|
26 |
# Configure logging
|
27 |
logging.basicConfig(level=logging.INFO,
|
|
|
274 |
|
275 |
process_button = gr.Button("Process and Classify", visible=False)
|
276 |
|
|
|
|
|
277 |
results_df = gr.Dataframe(interactive=True, visible=False)
|
278 |
|
|
|
|
|
279 |
# Create containers for visualization and validation report
|
280 |
with gr.Row(visible=False) as results_row:
|
281 |
with gr.Column():
|
|
|
287 |
validation_output = gr.Textbox(label="Validation Report", interactive=False)
|
288 |
improve_button = gr.Button("Improve Classification with Report", visible=False)
|
289 |
|
|
|
290 |
# Function to load file and suggest categories
|
291 |
def load_file_and_suggest_categories(file):
|
292 |
if not file:
|
|
|
319 |
|
320 |
# Use LLM to suggest categories
|
321 |
if client:
|
322 |
+
prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts[:5]))
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
try:
|
324 |
response = client.chat.completions.create(
|
325 |
model="gpt-3.5-turbo",
|
|
|
390 |
sample_texts.extend(df[col].head(5).tolist())
|
391 |
|
392 |
if client:
|
393 |
+
prompt = ADDITIONAL_CATEGORY_PROMPT.format(
|
394 |
+
existing_categories=", ".join(current_categories),
|
395 |
+
sample_texts="\n---\n".join(sample_texts[:5])
|
396 |
+
)
|
|
|
|
|
|
|
|
|
|
|
397 |
try:
|
398 |
response = client.chat.completions.create(
|
399 |
model="gpt-3.5-turbo",
|
|
|
427 |
try:
|
428 |
# Extract insights from validation report
|
429 |
if client:
|
430 |
+
prompt = VALIDATION_ANALYSIS_PROMPT.format(
|
431 |
+
validation_report=validation_report,
|
432 |
+
current_categories=categories
|
433 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
434 |
try:
|
435 |
response = client.chat.completions.create(
|
436 |
model="gpt-4",
|
|
|
454 |
temp_df = load_data(file.name)
|
455 |
sample_texts.extend(temp_df[col].head(5).tolist())
|
456 |
|
457 |
+
category_prompt = CATEGORY_IMPROVEMENT_PROMPT.format(
|
458 |
+
current_categories=", ".join(current_categories),
|
459 |
+
analysis=improvements.get('analysis', ''),
|
460 |
+
sample_texts="\n---\n".join(sample_texts[:5])
|
461 |
+
)
|
|
|
|
|
|
|
|
|
|
|
462 |
|
463 |
category_response = client.chat.completions.create(
|
464 |
model="gpt-4",
|
classifiers.py
CHANGED
@@ -7,6 +7,7 @@ import random
|
|
7 |
import json
|
8 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
9 |
from typing import List, Dict, Any, Optional
|
|
|
10 |
|
11 |
class BaseClassifier:
|
12 |
"""Base class for text classifiers"""
|
@@ -183,14 +184,7 @@ class LLMClassifier(BaseClassifier):
|
|
183 |
else:
|
184 |
sample_texts = texts
|
185 |
|
186 |
-
prompt = ""
|
187 |
-
I have a collection of texts that I need to classify into categories. Here are some examples:
|
188 |
-
|
189 |
-
{}
|
190 |
-
|
191 |
-
Based on these examples, suggest up 2 to 5 appropriate categories for classification.
|
192 |
-
Return your answer as a comma-separated list of category names only.
|
193 |
-
""".format("\n---\n".join(sample_texts))
|
194 |
|
195 |
try:
|
196 |
response = self.client.chat.completions.create(
|
@@ -212,20 +206,10 @@ class LLMClassifier(BaseClassifier):
|
|
212 |
|
213 |
def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
214 |
"""Use LLM to classify a single text"""
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
Text: {text}
|
221 |
-
|
222 |
-
Return your answer in JSON format with these fields:
|
223 |
-
- category: the chosen category from the list
|
224 |
-
- confidence: a value between 0 and 100 indicating your confidence in this classification (as a percentage)
|
225 |
-
- explanation: a brief explanation of why this category was chosen (1-2 sentences)
|
226 |
-
|
227 |
-
JSON response:
|
228 |
-
"""
|
229 |
|
230 |
try:
|
231 |
response = self.client.chat.completions.create(
|
|
|
7 |
import json
|
8 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
9 |
from typing import List, Dict, Any, Optional
|
10 |
+
from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT
|
11 |
|
12 |
class BaseClassifier:
|
13 |
"""Base class for text classifiers"""
|
|
|
184 |
else:
|
185 |
sample_texts = texts
|
186 |
|
187 |
+
prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
|
189 |
try:
|
190 |
response = self.client.chat.completions.create(
|
|
|
206 |
|
207 |
def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
208 |
"""Use LLM to classify a single text"""
|
209 |
+
prompt = TEXT_CLASSIFICATION_PROMPT.format(
|
210 |
+
categories=", ".join(categories),
|
211 |
+
text=text
|
212 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
|
214 |
try:
|
215 |
response = self.client.chat.completions.create(
|
prompts.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Prompts used in the text classification system"""
|
2 |
+
|
3 |
+
# Category suggestion prompt
|
4 |
+
CATEGORY_SUGGESTION_PROMPT = """
|
5 |
+
Based on these example texts, suggest 5 appropriate categories for classification:
|
6 |
+
|
7 |
+
{}
|
8 |
+
|
9 |
+
Return your answer as a comma-separated list of category names only.
|
10 |
+
"""
|
11 |
+
|
12 |
+
# Text classification prompt
|
13 |
+
TEXT_CLASSIFICATION_PROMPT = """
|
14 |
+
Classify the following text into one of these categories: {categories}
|
15 |
+
|
16 |
+
Text: {text}
|
17 |
+
|
18 |
+
Return your answer in JSON format with these fields:
|
19 |
+
- category: the chosen category from the list
|
20 |
+
- confidence: a value between 0 and 100 indicating your confidence in this classification (as a percentage)
|
21 |
+
- explanation: a brief explanation of why this category was chosen (1-2 sentences)
|
22 |
+
|
23 |
+
JSON response:
|
24 |
+
"""
|
25 |
+
|
26 |
+
# Additional category suggestion prompt
|
27 |
+
ADDITIONAL_CATEGORY_PROMPT = """
|
28 |
+
Based on these example texts and the existing categories ({existing_categories}),
|
29 |
+
suggest one additional appropriate category for classification.
|
30 |
+
|
31 |
+
Example texts:
|
32 |
+
{sample_texts}
|
33 |
+
|
34 |
+
Return only the suggested category name, nothing else.
|
35 |
+
"""
|
36 |
+
|
37 |
+
# Validation report analysis prompt
|
38 |
+
VALIDATION_ANALYSIS_PROMPT = """
|
39 |
+
Based on this validation report, analyze the current classification and suggest improvements:
|
40 |
+
|
41 |
+
{validation_report}
|
42 |
+
|
43 |
+
Return your answer in JSON format with these fields:
|
44 |
+
- suggested_categories: list of improved category names (must be different from current categories: {current_categories})
|
45 |
+
- confidence_threshold: a number between 0 and 100 for minimum confidence
|
46 |
+
- focus_areas: list of specific aspects to focus on during classification
|
47 |
+
- analysis: a brief analysis of what needs improvement
|
48 |
+
- new_categories_needed: boolean indicating if new categories should be added
|
49 |
+
|
50 |
+
JSON response:
|
51 |
+
"""
|
52 |
+
|
53 |
+
# Category improvement prompt
|
54 |
+
CATEGORY_IMPROVEMENT_PROMPT = """
|
55 |
+
Based on these example texts and the current categories ({current_categories}),
|
56 |
+
suggest new categories that would improve the classification. The validation report indicates:
|
57 |
+
{analysis}
|
58 |
+
|
59 |
+
Example texts:
|
60 |
+
{sample_texts}
|
61 |
+
|
62 |
+
Return your answer as a comma-separated list of new category names only.
|
63 |
+
"""
|