NamedEntityRecognitionTool

Running

App Files Files Community

Chris4K committed 6 days ago

Commit

7d26c0c

verified ·

1 Parent(s): 2113210

Update ner_tool.py

Browse files

Files changed (1) hide show

ner_tool.py +168 -7

ner_tool.py CHANGED Viewed

@@ -62,7 +62,10 @@ class NamedEntityRecognitionTool(Tool):
             "WORK_OF_ART": "🎨 Work of Art",
             "LAW": "⚖️ Law",
             "LANGUAGE": "🗣️ Language",
-            "FAC": "🏢 Facility"
         }
         # Pipeline will be lazily loaded
         self._pipeline = None
@@ -71,14 +74,41 @@ class NamedEntityRecognitionTool(Tool):
         """Load the NER pipeline with the specified model."""
         try:
             from transformers import pipeline
-            self._pipeline = pipeline("ner", model=model_name, aggregation_strategy="simple")
             return True
         except Exception as e:
             print(f"Error loading model {model_name}: {str(e)}")
             try:
                 # Fall back to default model
                 from transformers import pipeline
-                self._pipeline = pipeline("ner", model=self.default_model, aggregation_strategy="simple")
                 return True
             except Exception as fallback_error:
                 print(f"Error loading fallback model: {str(fallback_error)}")
@@ -88,6 +118,34 @@ class NamedEntityRecognitionTool(Tool):
         """Convert technical entity labels to friendly descriptions with color indicators."""
         # Strip B- or I- prefixes that indicate beginning or inside of entity
         clean_label = label.replace("B-", "").replace("I-", "")
         return self.entity_colors.get(clean_label, f"🔷 {clean_label}")
     def forward(self, text: str, model: str = None, aggregation: str = None, min_score: float = None) -> str:
@@ -127,6 +185,16 @@ class NamedEntityRecognitionTool(Tool):
             # Filter by confidence score
             entities = [e for e in entities if e.get('score', 0) >= min_score]
             if not entities:
                 return "No entities were detected in the text with the current settings."
@@ -143,9 +211,40 @@ class NamedEntityRecognitionTool(Tool):
     def _format_simple(self, text: str, entities: List[Dict[str, Any]]) -> str:
         """Format entities as a simple list."""
         result = "Named Entities Found:\n\n"
-        for entity in entities:
             word = entity.get("word", "")
             label = entity.get("entity", "UNKNOWN")
             score = entity.get("score", 0)
@@ -157,10 +256,41 @@ class NamedEntityRecognitionTool(Tool):
     def _format_grouped(self, text: str, entities: List[Dict[str, Any]]) -> str:
         """Format entities grouped by their category."""
         # Group entities by their label
         grouped = {}
-        for entity in entities:
             word = entity.get("word", "")
             label = entity.get("entity", "UNKNOWN").replace("B-", "").replace("I-", "")
@@ -181,11 +311,42 @@ class NamedEntityRecognitionTool(Tool):
     def _format_detailed(self, text: str, entities: List[Dict[str, Any]]) -> str:
         """Format entities with detailed information including position in text."""
         # First, build an entity map to highlight the entire text
         character_labels = [None] * len(text)
         # Mark each character with its entity
-        for entity in entities:
             start = entity.get("start", 0)
             end = entity.get("end", 0)
             label = entity.get("entity", "UNKNOWN")
@@ -226,7 +387,7 @@ class NamedEntityRecognitionTool(Tool):
         # Get entity details
         entity_details = []
-        for entity in entities:
             word = entity.get("word", "")
             label = entity.get("entity", "UNKNOWN")
             score = entity.get("score", 0)

             "WORK_OF_ART": "🎨 Work of Art",
             "LAW": "⚖️ Law",
             "LANGUAGE": "🗣️ Language",
+            "FAC": "🏢 Facility",
+            # Fix for models that don't properly tag entities
+            "O": "Not an entity",
+            "UNKNOWN": "🔷 Entity"
         }
         # Pipeline will be lazily loaded
         self._pipeline = None
         """Load the NER pipeline with the specified model."""
         try:
             from transformers import pipeline
+            import torch
+            # Try to detect if GPU is available
+            device = 0 if torch.cuda.is_available() else -1
+            # For some models, we need special handling
+            if "dslim/bert-base-NER" in model_name:
+                # This model works better with a specific aggregation strategy
+                self._pipeline = pipeline(
+                    "ner",
+                    model=model_name,
+                    aggregation_strategy="first",
+                    device=device
+                )
+            else:
+                self._pipeline = pipeline(
+                    "ner",
+                    model=model_name,
+                    aggregation_strategy="simple",
+                    device=device
+                )
             return True
         except Exception as e:
             print(f"Error loading model {model_name}: {str(e)}")
             try:
                 # Fall back to default model
                 from transformers import pipeline
+                import torch
+                device = 0 if torch.cuda.is_available() else -1
+                self._pipeline = pipeline(
+                    "ner",
+                    model=self.default_model,
+                    aggregation_strategy="first",
+                    device=device
+                )
                 return True
             except Exception as fallback_error:
                 print(f"Error loading fallback model: {str(fallback_error)}")
         """Convert technical entity labels to friendly descriptions with color indicators."""
         # Strip B- or I- prefixes that indicate beginning or inside of entity
         clean_label = label.replace("B-", "").replace("I-", "")
+        # Handle common name and location patterns with heuristics
+        if clean_label == "UNKNOWN" or clean_label == "O":
+            # Apply some basic heuristics to detect entity types
+            # This is a fallback when the model fails to properly tag
+            text = self._current_entity_text.lower() if hasattr(self, '_current_entity_text') else ""
+            # Check for capitalized words which might be names or places
+            if text and text[0].isupper():
+                # Countries and major cities
+                countries_and_cities = ["germany", "france", "spain", "italy", "london",
+                                        "paris", "berlin", "rome", "new york", "tokyo",
+                                        "beijing", "moscow", "canada", "australia", "india",
+                                        "china", "japan", "russia", "brazil", "mexico"]
+                if text.lower() in countries_and_cities:
+                    return self.entity_colors.get("LOC", "🟨 Location")
+                # Common first names (add more as needed)
+                common_names = ["john", "mike", "sarah", "david", "michael", "james",
+                               "robert", "mary", "jennifer", "linda", "michael", "william",
+                               "kristof", "chris", "thomas", "daniel", "matthew", "joseph",
+                               "donald", "richard", "charles", "paul", "mark", "kevin"]
+                name_parts = text.lower().split()
+                if name_parts and name_parts[0] in common_names:
+                    return self.entity_colors.get("PER", "🟥 Person")
         return self.entity_colors.get(clean_label, f"🔷 {clean_label}")
     def forward(self, text: str, model: str = None, aggregation: str = None, min_score: float = None) -> str:
             # Filter by confidence score
             entities = [e for e in entities if e.get('score', 0) >= min_score]
+            # Store the text for better heuristics
+            for entity in entities:
+                word = entity.get("word", "")
+                start = entity.get("start", 0)
+                end = entity.get("end", 0)
+                # Store the actual text from the input for better entity type detection
+                entity['actual_text'] = text[start:end]
+                # Set this for _get_friendly_label to use
+                self._current_entity_text = text[start:end]
             if not entities:
                 return "No entities were detected in the text with the current settings."
     def _format_simple(self, text: str, entities: List[Dict[str, Any]]) -> str:
         """Format entities as a simple list."""
+        # Process word pieces and handle subtoken merging
+        merged_entities = []
+        current_entity = None
+        for entity in sorted(entities, key=lambda e: e.get("start", 0)):
+            word = entity.get("word", "")
+            start = entity.get("start", 0)
+            end = entity.get("end", 0)
+            label = entity.get("entity", "UNKNOWN")
+            score = entity.get("score", 0)
+            # Check if this is a continuation (subtoken)
+            if word.startswith("##"):
+                if current_entity:
+                    # Extend the current entity
+                    current_entity["word"] += word.replace("##", "")
+                    current_entity["end"] = end
+                    # Keep the average score
+                    current_entity["score"] = (current_entity["score"] + score) / 2
+                continue
+            # Start a new entity
+            current_entity = {
+                "word": word,
+                "start": start,
+                "end": end,
+                "entity": label,
+                "score": score
+            }
+            merged_entities.append(current_entity)
         result = "Named Entities Found:\n\n"
+        for entity in merged_entities:
             word = entity.get("word", "")
             label = entity.get("entity", "UNKNOWN")
             score = entity.get("score", 0)
     def _format_grouped(self, text: str, entities: List[Dict[str, Any]]) -> str:
         """Format entities grouped by their category."""
+        # Process word pieces and handle subtoken merging
+        merged_entities = []
+        current_entity = None
+        for entity in sorted(entities, key=lambda e: e.get("start", 0)):
+            word = entity.get("word", "")
+            start = entity.get("start", 0)
+            end = entity.get("end", 0)
+            label = entity.get("entity", "UNKNOWN")
+            score = entity.get("score", 0)
+            # Check if this is a continuation (subtoken)
+            if word.startswith("##"):
+                if current_entity:
+                    # Extend the current entity
+                    current_entity["word"] += word.replace("##", "")
+                    current_entity["end"] = end
+                    # Keep the average score
+                    current_entity["score"] = (current_entity["score"] + score) / 2
+                continue
+            # Start a new entity
+            current_entity = {
+                "word": word,
+                "start": start,
+                "end": end,
+                "entity": label,
+                "score": score
+            }
+            merged_entities.append(current_entity)
         # Group entities by their label
         grouped = {}
+        for entity in merged_entities:
             word = entity.get("word", "")
             label = entity.get("entity", "UNKNOWN").replace("B-", "").replace("I-", "")
     def _format_detailed(self, text: str, entities: List[Dict[str, Any]]) -> str:
         """Format entities with detailed information including position in text."""
+        # Process word pieces and handle subtoken merging
+        merged_entities = []
+        current_entity = None
+        for entity in sorted(entities, key=lambda e: e.get("start", 0)):
+            word = entity.get("word", "")
+            start = entity.get("start", 0)
+            end = entity.get("end", 0)
+            label = entity.get("entity", "UNKNOWN")
+            score = entity.get("score", 0)
+            # Check if this is a continuation (subtoken)
+            if word.startswith("##"):
+                if current_entity:
+                    # Extend the current entity
+                    current_entity["word"] += word.replace("##", "")
+                    current_entity["end"] = end
+                    # Keep the average score
+                    current_entity["score"] = (current_entity["score"] + score) / 2
+                continue
+            # Start a new entity
+            current_entity = {
+                "word": word,
+                "start": start,
+                "end": end,
+                "entity": label,
+                "score": score
+            }
+            merged_entities.append(current_entity)
         # First, build an entity map to highlight the entire text
         character_labels = [None] * len(text)
         # Mark each character with its entity
+        for entity in merged_entities:
             start = entity.get("start", 0)
             end = entity.get("end", 0)
             label = entity.get("entity", "UNKNOWN")
         # Get entity details
         entity_details = []
+        for entity in merged_entities:
             word = entity.get("word", "")
             label = entity.get("entity", "UNKNOWN")
             score = entity.get("score", 0)