Abhigyan committed
Commit f68c4f8 · 1 Parent(s): 3bdd5ce
Files changed (3)
  1. __pycache__/ner_module.cpython-310.pyc +0 -0
  2. app.py +184 -381
  3. ner_module.py +68 -86
__pycache__/ner_module.cpython-310.pyc ADDED
Binary file (9.93 kB).
 
app.py CHANGED
@@ -1,396 +1,199 @@
- # ner_module.py
- import torch
- import time
- from typing import List, Dict, Any, Tuple
- from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
- import logging
-
- # Configure logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
- class NERModel:
-     """
-     A singleton class to manage the NER model loading and prediction.
-     Ensures the potentially large model is loaded only once.
-     """
-     _instance = None
-     _model = None
-     _tokenizer = None
-     _pipeline = None
-     _model_name = None  # Store model name used for initialization
-
-     @classmethod
-     def get_instance(cls, model_name: str = "Davlan/bert-base-multilingual-cased-ner-hrl"):
-         """
-         Singleton pattern: Get the existing instance or create a new one.
-         Uses the specified model_name only during the first initialization.
-         """
-         if cls._instance is None:
-             logger.info(f"Creating new NERModel instance with model: {model_name}")
-             cls._instance = cls(model_name)
-         elif cls._model_name != model_name:
-             logger.warning(f"NERModel already initialized with {cls._model_name}. Ignoring new model name {model_name}.")
-         return cls._instance
-
-     def __init__(self, model_name: str):
-         """
-         Initialize the model, tokenizer, and pipeline.
-         Private constructor - use get_instance() instead.
-         """
-         if NERModel._instance is not None:
-             raise Exception("This class is a singleton! Use get_instance() to get the object.")
-         else:
-             self.model_name = model_name
-             NERModel._model_name = model_name  # Store the model name
-             self._load_model()
-             NERModel._instance = self  # Assign the instance here
-
-     def _load_model(self):
-         """Load the NER model and tokenizer from Hugging Face."""
-         logger.info(f"Loading model: {self.model_name}")
-         start_time = time.time()
-
-         try:
-             # Load tokenizer and model
-             self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-             self._model = AutoModelForTokenClassification.from_pretrained(self.model_name)
-
-             # Check if the model is a PyTorch model for potential optimizations
-             if isinstance(self._model, torch.nn.Module):
-                 self._model.eval()  # Set model to evaluation mode (important for inference)
-                 # self._model.share_memory()  # share_memory() might not be needed unless using multiprocessing explicitly
-
-             # Create the NER pipeline
-             # Specify device=-1 for CPU, device=0 for first GPU, etc.
-             # Let pipeline decide device automatically by default, or specify if needed
-             self._pipeline = pipeline(
-                 "ner",
-                 model=self._model,
-                 tokenizer=self._tokenizer,
-                 # grouped_entities=True  # Group subword tokens automatically (alternative to manual combination)
-             )
-
-             load_time = time.time() - start_time
-             logger.info(f"Model '{self.model_name}' loaded successfully in {load_time:.2f} seconds.")
-
-         except Exception as e:
-             logger.error(f"Error loading model {self.model_name}: {e}")
-             # Clean up partial loads if necessary
-             self._tokenizer = None
-             self._model = None
-             self._pipeline = None
-             # Re-raise the exception to signal failure
-             raise
-
-     def predict(self, text: str) -> List[Dict[str, Any]]:
-         """
-         Run NER prediction on the input text using the loaded pipeline.
-
-         Args:
-             text: The input string to perform NER on.
-
-         Returns:
-             A list of dictionaries, where each dictionary represents an entity
-             identified by the pipeline. The exact format depends on the pipeline
-             configuration (e.g., grouped_entities).
-         """
-         if self._pipeline is None:
-             logger.error("NER pipeline is not initialized. Cannot predict.")
-             return []  # Return empty list or raise an error
-
-         if not text or not isinstance(text, str):
-             logger.warning("Prediction called with empty or invalid text.")
-             return []
-
-         logger.debug(f"Running prediction on text: '{text[:100]}...'")  # Log snippet
-         try:
-             # The pipeline handles tokenization and prediction
-             results = self._pipeline(text)
-             logger.debug(f"Prediction results: {results}")
-             return results
-         except Exception as e:
-             logger.error(f"Error during NER prediction: {e}")
-             return []  # Return empty list on error
-
-
- class TextProcessor:
-     """
-     Provides static methods for processing text, specifically for NER tasks,
-     including combining subword entities and handling large texts via chunking.
-     """
-
-     @staticmethod
-     def combine_entities(ner_results: List[Dict[str, Any]], original_text: str) -> List[Dict[str, Any]]:
-         """
-         Combine entities that might be split into subword tokens (B-TAG, I-TAG).
-         This method assumes the pipeline did *not* use grouped_entities=True.
-
-         Args:
-             ner_results: The raw output from the NER pipeline (list of token dictionaries).
-             original_text: The original text input to extract entity words accurately.
-
-         Returns:
-             A list of dictionaries, each representing a combined entity with
-             'entity_type', 'start', 'end', 'score', and 'word'.
-         """
-         if not ner_results:
-             return []
-
-         combined_entities = []
-         current_entity = None
-
-         for token in ner_results:
-             # Basic validation of token structure
-             if not all(k in token for k in ['entity', 'start', 'end', 'score']):
-                 logger.warning(f"Skipping malformed token: {token}")
-                 continue
-
-             # Skip 'O' tags (Outside any entity)
-             if token['entity'] == 'O':
-                 # If we were tracking an entity, finalize it before moving on
-                 if current_entity:
-                     combined_entities.append(current_entity)
-                     current_entity = None
-                 continue
-
-             # Extract entity type (e.g., 'PER', 'LOC') removing 'B-' or 'I-'
-             entity_tag = token['entity']
-             if entity_tag.startswith('B-') or entity_tag.startswith('I-'):
-                 entity_type = entity_tag[2:]
-             else:
-                 # Handle cases where the tag might not have B-/I- prefix (less common)
-                 logger.warning(f"Unexpected entity tag format: {entity_tag}. Using as is.")
-                 entity_type = entity_tag
-
-             # Start of a new entity ('B-') or continuation of a different entity type
-             if entity_tag.startswith('B-') or (entity_tag.startswith('I-') and (not current_entity or current_entity['entity_type'] != entity_type)):
-                 # Finalize the previous entity if it exists
-                 if current_entity:
-                     combined_entities.append(current_entity)
-
-                 # Start the new entity
-                 current_entity = {
-                     'entity_type': entity_type,
-                     'start': token['start'],
-                     'end': token['end'],
-                     'score': float(token['score']),
-                     'token_count': 1  # Keep track of tokens for averaging score
-                 }
-
-             # Continuation of the current entity ('I-' and matching type)
-             elif entity_tag.startswith('I-') and current_entity and current_entity['entity_type'] == entity_type:
-                 # Extend the end position
-                 current_entity['end'] = token['end']
-                 # Update the score (e.g., average)
-                 current_entity['score'] = (current_entity['score'] * current_entity['token_count'] + float(token['score'])) / (current_entity['token_count'] + 1)
-                 current_entity['token_count'] += 1
-
-             # Handle unexpected cases (e.g., I- tag without preceding B- or matching I-)
-             else:
-                 logger.warning(f"Encountered unexpected token sequence at: {token}. Resetting current entity.")
-                 if current_entity:
-                     combined_entities.append(current_entity)
-                 current_entity = None  # Reset
-
-
-         # Add the last tracked entity if it exists
-         if current_entity:
-             combined_entities.append(current_entity)
-
-         # Extract the actual text 'word' for each combined entity
-         for entity in combined_entities:
-             try:
-                 entity['word'] = original_text[entity['start']:entity['end']].strip()
-                 # Remove internal helper key
-                 if 'token_count' in entity:
-                     del entity['token_count']
-             except IndexError:
-                 logger.error(f"Index error extracting word for entity: {entity} with text length {len(original_text)}")
-                 entity['word'] = "[Error extracting word]"
-
-
-         # Optional: Sort entities by start position
-         combined_entities.sort(key=lambda x: x['start'])
-
-         logger.info(f"Combined {len(ner_results)} raw tokens into {len(combined_entities)} entities.")
-         return combined_entities
-
-     @staticmethod
-     def process_large_text(text: str, model: NERModel, chunk_size: int = 512, overlap: int = 50) -> List[Dict[str, Any]]:
-         """
-         Process large text by splitting it into overlapping chunks, running NER
-         on each chunk, and then combining the results intelligently.
-
-         Args:
-             text: The large input text string.
-             model: The initialized NERModel instance.
-             chunk_size: The maximum size of each text chunk (in characters or tokens,
-                         depending on the tokenizer's limits, often ~512 for BERT).
-             overlap: The number of characters/tokens to overlap between consecutive chunks
-                      to ensure entities spanning chunk boundaries are captured.
-
-         Returns:
-             A list of combined entity dictionaries for the entire text.
-         """
-         if not text:
-             return []
-
-         # Use tokenizer max length if available and smaller than chunk_size
-         if model._tokenizer and hasattr(model._tokenizer, 'model_max_length'):
-             tokenizer_max_len = model._tokenizer.model_max_length
-             if chunk_size > tokenizer_max_len:
-                 logger.warning(f"Requested chunk_size {chunk_size} exceeds model max length {tokenizer_max_len}. Using {tokenizer_max_len}.")
-                 chunk_size = tokenizer_max_len
-         # Ensure overlap is reasonable compared to chunk size
-         if overlap >= chunk_size // 2:
-             logger.warning(f"Overlap {overlap} seems large for chunk_size {chunk_size}. Reducing overlap to {chunk_size // 4}.")
-             overlap = chunk_size // 4
-
-
-         logger.info(f"Processing large text (length {len(text)}) with chunk_size={chunk_size}, overlap={overlap}")
-         chunks = TextProcessor._create_chunks(text, chunk_size, overlap)
-         logger.info(f"Split text into {len(chunks)} chunks.")
-
-         all_raw_results = []
-         total_processing_time = 0
-
-         for i, (chunk_text, start_pos) in enumerate(chunks):
-             logger.debug(f"Processing chunk {i+1}/{len(chunks)} (start_pos: {start_pos}, length: {len(chunk_text)})")
-             start_time = time.time()
-
-             # Get raw predictions for the current chunk
-             raw_results_chunk = model.predict(chunk_text)
-
-             chunk_processing_time = time.time() - start_time
-             total_processing_time += chunk_processing_time
-             logger.debug(f"Chunk {i+1} processed in {chunk_processing_time:.2f}s. Found {len(raw_results_chunk)} raw entities.")
-
-
-             # Adjust entity positions relative to the original text
-             for result in raw_results_chunk:
-                 # Check if 'start' and 'end' exist before adjusting
-                 if 'start' in result and 'end' in result:
-                     result['start'] += start_pos
-                     result['end'] += start_pos
-                 else:
-                     logger.warning(f"Skipping position adjustment for malformed result in chunk {i+1}: {result}")
-
-
-             all_raw_results.extend(raw_results_chunk)
-
-         logger.info(f"Finished processing all chunks in {total_processing_time:.2f} seconds.")
-         logger.info(f"Total raw entities found across all chunks: {len(all_raw_results)}")
-
-         # Combine entities from all chunks, handling potential duplicates from overlap
-         # The combine_entities method needs refinement to handle overlaps better,
-         # e.g., by prioritizing entities from non-overlapped regions or merging based on confidence.
-         # For now, we use the existing combine_entities, which might create duplicates if
-         # an entity appears fully in the overlap region of two chunks.
-         # A more robust approach would involve deduplication based on start/end/type.
-         combined_entities = TextProcessor.combine_entities(all_raw_results, text)
-
-         # Simple deduplication based on exact start, end, and type
-         unique_entities = []
-         seen_entities = set()
-         for entity in combined_entities:
-             entity_key = (entity['start'], entity['end'], entity['entity_type'])
-             if entity_key not in seen_entities:
-                 unique_entities.append(entity)
-                 seen_entities.add(entity_key)
-             else:
-                 logger.debug(f"Duplicate entity removed: {entity}")
-
-         logger.info(f"Final number of unique combined entities: {len(unique_entities)}")
-         return unique_entities
-
-
-     @staticmethod
-     def _create_chunks(text: str, chunk_size: int = 512, overlap: int = 50) -> List[Tuple[str, int]]:
-         """
-         Split text into potentially overlapping chunks, trying to respect word boundaries.
-
-         Args:
-             text: The input text string.
-             chunk_size: The target maximum size of each chunk.
-             overlap: The desired overlap between consecutive chunks.
-
-         Returns:
-             A list of tuples, where each tuple contains (chunk_text, start_position_in_original_text).
-         """
-         if not text:
-             return []
-         if chunk_size <= overlap:
-             raise ValueError("chunk_size must be greater than overlap")
-         if chunk_size <= 0:
-             raise ValueError("chunk_size must be positive")
-
-
-         chunks = []
-         start = 0
-         text_len = len(text)
-
-         while start < text_len:
-             # Determine the ideal end position
-             end = start + chunk_size
-
-             # If the ideal end is beyond the text length, just take the rest
-             if end >= text_len:
-                 chunks.append((text[start:], start))
-                 break  # We've reached the end
-
-             # Try to find a suitable split point (e.g., whitespace) near the ideal end
-             # Search backwards from the ideal end position within a reasonable window (e.g., overlap size)
-             split_pos = -1
-             search_start = max(start, end - overlap)  # Don't search too far back
-             for i in range(end, search_start - 1, -1):
-                 # Prefer splitting at whitespace
-                 if text[i].isspace():
-                     split_pos = i + 1  # Split *after* the space
-                     break
-                 # Consider splitting at punctuation as a fallback? (optional)
-                 # import string
-                 # if text[i] in string.punctuation:
-                 #     split_pos = i + 1
-                 #     break
-
-
-             # If no good split point found nearby, just cut at the chunk_size
-             if split_pos == -1 or split_pos <= start:
-                 actual_end = end
-                 logger.debug(f"No suitable whitespace found near char {end}, cutting at {actual_end}")
-             else:
-                 actual_end = split_pos
-                 logger.debug(f"Found whitespace split point at char {actual_end}")
-
-
-             # Ensure the chunk isn't empty if split_pos was too close to start
-             if actual_end <= start:
-                 actual_end = end  # Fallback to hard cut if split logic fails
-
-             # Add the chunk and its starting position
-             chunks.append((text[start:actual_end], start))
-
-             # Determine the start of the next chunk
-             # Move forward by chunk_size minus overlap, ensuring progress
-             next_start = start + (chunk_size - overlap)
-
-             # If we split at whitespace (actual_end), we can potentially start the next chunk
-             # right after the split point to avoid redundant processing of the overlap zone
-             # if the split was significantly before the ideal 'end'.
-             # However, the simple `next_start = start + (chunk_size - overlap)` is safer
-             # to ensure consistent overlap handling unless more complex logic is added.
-             # Let's stick to the simpler approach for now:
-             # next_start = actual_end - overlap  # This could lead to variable overlap size
-
-             # Ensure we always make progress
-             if next_start <= start:
-                 logger.warning("Chunking logic resulted in no progress. Moving start by 1.")
-                 next_start = start + 1
-
-
-             start = next_start
-
-
-         return chunks
-
+ # app.py
+ import streamlit as st
+ from ner_module import NERModel, TextProcessor
+ import time
+ import logging
+
+ # Configure logging (optional, but helpful for debugging Streamlit apps)
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ # --- Configuration ---
+ DEFAULT_MODEL = "Davlan/bert-base-multilingual-cased-ner-hrl"
+ # Alternative models (ensure they are compatible TokenClassification models)
+ # DEFAULT_MODEL = "dslim/bert-base-NER"  # English NER
+ # DEFAULT_MODEL = "xlm-roberta-large-finetuned-conll03-english"  # Another English option
+
+ DEFAULT_TEXT = """
+ Angela Merkel met Emmanuel Macron in Berlin on Tuesday to discuss the future of the European Union.
+ They visited the Brandenburg Gate and enjoyed some Currywurst. Later, they flew to Paris.
+ John Doe from New York works at Google LLC.
+ """
+ CHUNK_SIZE_DEFAULT = 500  # Slightly less than the common 512 limit to be safe
+ OVERLAP_DEFAULT = 50
+
+ # --- Caching ---
+ @st.cache_resource(show_spinner="Loading NER Model...")
+ def load_ner_model(model_name: str):
+     """
+     Loads the NERModel using the singleton pattern and caches the instance.
+     Streamlit's cache_resource is ideal for heavy objects like models.
+     """
+     try:
+         logger.info(f"Attempting to load model: {model_name}")
+         model_instance = NERModel.get_instance(model_name=model_name)
+         return model_instance
+     except Exception as e:
+         st.error(f"Failed to load model '{model_name}'. Error: {e}", icon="🚨")
+         logger.error(f"Fatal error loading model {model_name}: {e}")
+         return None
+
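
Worth noting about the pairing above: st.cache_resource memoizes per argument value and per process, while the singleton inside NERModel guards against a second model ever being constructed, even if the cached function runs again with a different name (in that case get_instance logs a warning and returns the first instance). A minimal sketch of that interaction, assuming ner_module is importable:

import streamlit as st
from ner_module import NERModel

@st.cache_resource
def load_cached(name: str):
    # Runs once per distinct `name`; the result is shared across reruns/sessions.
    return NERModel.get_instance(model_name=name)

m1 = load_cached("Davlan/bert-base-multilingual-cased-ner-hrl")
m2 = load_cached("dslim/bert-base-NER")  # new name, so the function body runs again,
assert m1 is m2                          # but the singleton still returns the first model
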
+ # --- Helper Functions ---
+ def get_color_for_entity(entity_type: str) -> str:
+     """Assigns a color based on the entity type for visualization."""
+     # Simple color mapping, can be expanded
+     colors = {
+         "PER": "#faa",   # Light red for Person
+         "ORG": "#afa",   # Light green for Organization
+         "LOC": "#aaf",   # Light blue for Location
+         "MISC": "#ffc",  # Light yellow for Miscellaneous
+         # Add more colors as needed based on the model's entity types
+     }
+     # Default color if type not found
+     return colors.get(entity_type.upper(), "#ddd")  # Light grey default
+
+ def highlight_entities(text: str, entities: list) -> str:
+     """
+     Generates an HTML string with entities highlighted using spans and colors.
+     Sorts entities by start position descending to handle nested entities correctly.
+     """
+     if not entities:
+         return text
+
+     # Sort entities by start index in descending order
+     # This ensures that inner entities are processed before outer ones if they overlap
+     entities.sort(key=lambda x: x['start'], reverse=True)
+
+     highlighted_text = text
+     for entity in entities:
+         start = entity['start']
+         end = entity['end']
+         entity_type = entity['entity_type']
+         word = entity['word']  # Use the extracted word for the title/tooltip
+         color = get_color_for_entity(entity_type)
+
+         # Create the highlighted span
+         highlight = (
+             f'<span style="background-color: {color}; padding: 0.2em 0.3em; '
+             f'margin: 0 0.15em; line-height: 1; border-radius: 0.3em;" '
+             f'title="{entity_type}: {word} (Score: {entity.get("score", 0):.2f})">'  # Tooltip
+             f'{highlighted_text[start:end]}'  # Get the original text slice
+             f'<sup style="font-size: 0.7em; font-weight: bold; margin-left: 2px; color: #555;">{entity_type}</sup>'  # Small label
+             f'</span>'
+         )
+
+         # Replace the original text portion with the highlighted version
+         # Working backwards prevents index issues from altering string length
+         highlighted_text = highlighted_text[:start] + highlight + highlighted_text[end:]
+
+     return highlighted_text
+
+
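
The descending sort above matters because each splice changes the string's length: replacing left-to-right would shift every later entity's offsets. Right-to-left, each splice only touches text whose entities are already rendered. A toy trace of the same idea (the spans are hypothetical, not pipeline output):

text = "Angela Merkel met Emmanuel Macron"
spans = [(0, 13), (18, 33)]  # (start, end) pairs, ascending order
for start, end in sorted(spans, reverse=True):  # process right-to-left
    text = text[:start] + "<" + text[start:end] + ">" + text[end:]
print(text)  # <Angela Merkel> met <Emmanuel Macron>
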
+ # --- Streamlit App UI ---
+ st.set_page_config(layout="wide", page_title="NER Demo")
+
+ st.title("📝 Named Entity Recognition (NER) Demo")
+ st.markdown("Highlight Persons (PER), Organizations (ORG), Locations (LOC), and Miscellaneous (MISC) entities in text using a Hugging Face Transformer model.")
+
+ # Model selection fixed to default for simplicity
+ model_name = DEFAULT_MODEL
+
+ # Load the model (cached)
+ ner_model = load_ner_model(model_name)
+
+ if ner_model:  # Proceed only if the model loaded successfully
+     st.success(f"Model '{ner_model.model_name}' loaded successfully.", icon="✅")
+
+     # --- Input & Controls ---
+     col1, col2 = st.columns([3, 1])  # Input area takes more space
+
+     with col1:
+         st.subheader("Input Text")
+         # Use session state to keep text area content persistent across reruns
+         if 'text_input' not in st.session_state:
+             st.session_state.text_input = DEFAULT_TEXT
+         text_input = st.text_area("Enter text here:", value=st.session_state.text_input, height=250, key="text_area_input")
+         st.session_state.text_input = text_input  # Update session state on change
+
+     with col2:
+         st.subheader("Options")
+         use_chunking = st.checkbox("Process as Large Text (Chunking)", value=True)
+
+         chunk_size = CHUNK_SIZE_DEFAULT
+         overlap = OVERLAP_DEFAULT
+
+         if use_chunking:
+             chunk_size = st.slider("Chunk Size (chars)", min_value=100, max_value=1024, value=CHUNK_SIZE_DEFAULT, step=10)
+             overlap = st.slider("Overlap (chars)", min_value=10, max_value=chunk_size // 2, value=OVERLAP_DEFAULT, step=5)
+
+         process_button = st.button("Analyze Text", type="primary", use_container_width=True)
+
+     # --- Processing and Output ---
+     if process_button and text_input:
+         start_process_time = time.time()
+         st.markdown("---")  # Separator
+         st.subheader("Analysis Results")
+
+         with st.spinner("Analyzing text... Please wait."):
+             if use_chunking:
+                 logger.info(f"Processing with chunking: size={chunk_size}, overlap={overlap}")
+                 entities = TextProcessor.process_large_text(
+                     text=text_input,
+                     model=ner_model,
+                     chunk_size=chunk_size,
+                     overlap=overlap
+                 )
+             else:
+                 logger.info("Processing without chunking (potential truncation for long text)")
+                 entities = TextProcessor.process_large_text(
+                     text=text_input,
+                     model=ner_model,
+                     chunk_size=max(len(text_input), 512),  # Use text length or a large value
+                     overlap=0  # No overlap needed for single chunk
+                 )
+
+         end_process_time = time.time()
+         processing_duration = end_process_time - start_process_time
+         st.info(f"Analysis completed in {processing_duration:.2f} seconds. Found {len(entities)} entities.", icon="⏱️")
+
+         if entities:
+             # Display highlighted text
+             st.markdown("#### Highlighted Text:")
+             highlighted_html = highlight_entities(text_input, entities)
+             # Use st.markdown to render the HTML
+             st.markdown(highlighted_html, unsafe_allow_html=True)
+
+             # Display entities in a table-like format
+             st.markdown("#### Extracted Entities:")
+             # Sort entities by appearance order for the list
+             entities.sort(key=lambda x: x['start'])
+
+             # Use columns for a cleaner layout
+             cols = st.columns(3)  # Adjust number of columns as needed
+             col_idx = 0
+             for entity in entities:
+                 with cols[col_idx % len(cols)]:
+                     st.markdown(
+                         f"**{entity['entity_type']}** `{entity['score']:.2f}`: "
+                         f"{entity['word']} ({entity['start']}-{entity['end']})"
+                     )
+                 col_idx += 1
+
+             # Alternative display as an expander with detailed info
+             with st.expander("Show Detailed Entity List", expanded=False):
+                 for entity in entities:
+                     st.write(f"- **{entity['entity_type']}**: {entity['word']} (Score: {entity['score']:.2f}, Position: {entity['start']}-{entity['end']})")
+
+         else:
+             st.warning("No entities found in the provided text.", icon="❓")
+
+ elif process_button and not text_input:
+     st.warning("Please enter some text to analyze.", icon="⚠️")
+
+ else:
+     # This block runs if the model failed to load
+     st.error("NER model could not be loaded. Please check the logs or model name. The application cannot proceed.", icon="🛑")
+
+ # Add footer or instructions
+ st.markdown("---")
+ st.caption("Powered by Hugging Face Transformers and Streamlit.")
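
One caveat in the app above: the sliders measure chunk size in characters, while the model's hard limit (model_max_length, typically 512 for BERT-family checkpoints) is in tokens, and process_large_text compares the two numbers directly. A 500-character chunk can therefore still tokenize to more than 512 tokens in token-dense text. A sanity check one could add before prediction (a standard tokenizer call; the variable names are illustrative, not from the commit):

# `tokenizer` is the AutoTokenizer held by NERModel; `chunk_text` is one chunk.
n_tokens = len(tokenizer(chunk_text, add_special_tokens=True)["input_ids"])
if n_tokens > tokenizer.model_max_length:
    logger.warning(f"{len(chunk_text)}-char chunk is {n_tokens} tokens; reduce chunk_size.")
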
ner_module.py CHANGED
@@ -59,16 +59,13 @@ class NERModel:
             # Check if the model is a PyTorch model for potential optimizations
             if isinstance(self._model, torch.nn.Module):
                 self._model.eval()  # Set model to evaluation mode (important for inference)
-                 # self._model.share_memory()  # share_memory() might not be needed unless using multiprocessing explicitly

             # Create the NER pipeline
-             # Specify device=-1 for CPU, device=0 for first GPU, etc.
-             # Let pipeline decide device automatically by default, or specify if needed
             self._pipeline = pipeline(
                 "ner",
                 model=self._model,
                 tokenizer=self._tokenizer,
-                 # grouped_entities=True  # Group subword tokens automatically (alternative to manual combination)
+                 # grouped_entities=True  # Uncomment if you want to use pipeline's built-in grouping
             )

             load_time = time.time() - start_time
@@ -92,8 +89,7 @@ class NERModel:

         Returns:
             A list of dictionaries, where each dictionary represents an entity
-             identified by the pipeline. The exact format depends on the pipeline
-             configuration (e.g., grouped_entities).
+             identified by the pipeline.
         """
         if self._pipeline is None:
             logger.error("NER pipeline is not initialized. Cannot predict.")
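
The grouped_entities=True comment points at the pipeline's built-in alternative to the manual combine_entities pass below; in recent transformers releases the same switch is spelled aggregation_strategy. A sketch of the built-in grouping (results then carry 'entity_group' instead of per-token 'entity' tags):

from transformers import pipeline

ner = pipeline(
    "ner",
    model="Davlan/bert-base-multilingual-cased-ner-hrl",
    aggregation_strategy="simple",  # merge B-/I- subword pieces into whole entities
)
print(ner("Angela Merkel lives in Berlin."))
# e.g. [{'entity_group': 'PER', 'word': 'Angela Merkel', 'start': 0, 'end': 13, ...}, ...]
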
@@ -160,7 +156,6 @@ class TextProcessor:
                 entity_type = entity_tag[2:]
             else:
                 # Handle cases where the tag might not have B-/I- prefix (less common)
-                 logger.warning(f"Unexpected entity tag format: {entity_tag}. Using as is.")
                 entity_type = entity_tag

             # Start of a new entity ('B-') or continuation of a different entity type
@@ -188,11 +183,17 @@

             # Handle unexpected cases (e.g., I- tag without preceding B- or matching I-)
             else:
-                 logger.warning(f"Encountered unexpected token sequence at: {token}. Resetting current entity.")
+                 logger.warning(f"Encountered unexpected token sequence at: {token}. Starting new entity.")
                 if current_entity:
                     combined_entities.append(current_entity)
-                 current_entity = None  # Reset
-
+                 # Try to create a new entity from this token
+                 current_entity = {
+                     'entity_type': entity_type,
+                     'start': token['start'],
+                     'end': token['end'],
+                     'score': float(token['score']),
+                     'token_count': 1
+                 }

         # Add the last tracked entity if it exists
         if current_entity:
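
The behavioral change in the hunk above concerns the fallback else branch, which catches tags carrying no B-/I- prefix at all (some models emit plain 'PER'-style tags). Previously such a token closed the current entity and was itself dropped; now it opens a new entity. Illustrated with hypothetical raw tokens (not actual pipeline output):

tokens = [
    {'entity': 'PER', 'start': 0, 'end': 6, 'score': 0.98},   # "Angela": no B-/I- prefix
    {'entity': 'PER', 'start': 7, 'end': 13, 'score': 0.97},  # "Merkel"
]
# Before this commit, each such token was logged and current_entity reset, so
# both words were lost. After it, the first token opens a PER entity; the
# second closes that span and opens another, so both words survive.
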
@@ -201,16 +202,18 @@
         # Extract the actual text 'word' for each combined entity
         for entity in combined_entities:
             try:
-                 entity['word'] = original_text[entity['start']:entity['end']].strip()
+                 # Ensure indices are valid
+                 start = max(0, min(entity['start'], len(original_text)))
+                 end = max(start, min(entity['end'], len(original_text)))
+                 entity['word'] = original_text[start:end].strip()
                 # Remove internal helper key
                 if 'token_count' in entity:
                     del entity['token_count']
-             except IndexError:
-                 logger.error(f"Index error extracting word for entity: {entity} with text length {len(original_text)}")
+             except Exception as e:
+                 logger.error(f"Error extracting word for entity: {entity}, error: {e}")
                 entity['word'] = "[Error extracting word]"

-
-         # Optional: Sort entities by start position
+         # Sort entities by start position
         combined_entities.sort(key=lambda x: x['start'])

         logger.info(f"Combined {len(ner_results)} raw tokens into {len(combined_entities)} entities.")
@@ -225,10 +228,8 @@
         Args:
             text: The large input text string.
             model: The initialized NERModel instance.
-             chunk_size: The maximum size of each text chunk (in characters or tokens,
-                         depending on the tokenizer's limits, often ~512 for BERT).
-             overlap: The number of characters/tokens to overlap between consecutive chunks
-                      to ensure entities spanning chunk boundaries are captured.
+             chunk_size: The maximum size of each text chunk.
+             overlap: The number of characters to overlap between consecutive chunks.

         Returns:
             A list of combined entity dictionaries for the entire text.
@@ -247,7 +248,6 @@
             logger.warning(f"Overlap {overlap} seems large for chunk_size {chunk_size}. Reducing overlap to {chunk_size // 4}.")
             overlap = chunk_size // 4

-
         logger.info(f"Processing large text (length {len(text)}) with chunk_size={chunk_size}, overlap={overlap}")
         chunks = TextProcessor._create_chunks(text, chunk_size, overlap)
         logger.info(f"Split text into {len(chunks)} chunks.")
@@ -266,7 +266,6 @@
             total_processing_time += chunk_processing_time
             logger.debug(f"Chunk {i+1} processed in {chunk_processing_time:.2f}s. Found {len(raw_results_chunk)} raw entities.")

-
             # Adjust entity positions relative to the original text
             for result in raw_results_chunk:
                 # Check if 'start' and 'end' exist before adjusting
@@ -276,35 +275,48 @@
                 else:
                     logger.warning(f"Skipping position adjustment for malformed result in chunk {i+1}: {result}")

-
             all_raw_results.extend(raw_results_chunk)

         logger.info(f"Finished processing all chunks in {total_processing_time:.2f} seconds.")
         logger.info(f"Total raw entities found across all chunks: {len(all_raw_results)}")

-         # Combine entities from all chunks, handling potential duplicates from overlap
-         # The combine_entities method needs refinement to handle overlaps better,
-         # e.g., by prioritizing entities from non-overlapped regions or merging based on confidence.
-         # For now, we use the existing combine_entities, which might create duplicates if
-         # an entity appears fully in the overlap region of two chunks.
-         # A more robust approach would involve deduplication based on start/end/type.
+         # Combine entities from all chunks
         combined_entities = TextProcessor.combine_entities(all_raw_results, text)

-         # Simple deduplication based on exact start, end, and type
+         # Deduplicate entities based on overlapping positions
+         # Two entities are considered duplicates if they have the same type and
+         # overlap by more than 50% of the shorter entity's length
         unique_entities = []
-         seen_entities = set()
         for entity in combined_entities:
-             entity_key = (entity['start'], entity['end'], entity['entity_type'])
-             if entity_key not in seen_entities:
+             is_duplicate = False
+             # Calculate entity length for overlap comparison
+             entity_length = entity['end'] - entity['start']
+
+             for existing in unique_entities:
+                 if existing['entity_type'] == entity['entity_type']:
+                     # Check for significant overlap
+                     overlap_start = max(entity['start'], existing['start'])
+                     overlap_end = min(entity['end'], existing['end'])
+                     if overlap_start < overlap_end:  # They overlap
+                         overlap_length = overlap_end - overlap_start
+                         shorter_length = min(entity_length, existing['end'] - existing['start'])
+
+                         # If overlap is significant (>50% of shorter entity)
+                         if overlap_length > 0.5 * shorter_length:
+                             is_duplicate = True
+                             # Keep the one with higher score
+                             if entity['score'] > existing['score']:
+                                 # Replace the existing entity with this one
+                                 unique_entities.remove(existing)
+                                 is_duplicate = False
+                             break
+
+             if not is_duplicate:
                 unique_entities.append(entity)
-                 seen_entities.add(entity_key)
-             else:
-                 logger.debug(f"Duplicate entity removed: {entity}")

         logger.info(f"Final number of unique combined entities: {len(unique_entities)}")
         return unique_entities

-
     @staticmethod
     def _create_chunks(text: str, chunk_size: int = 512, overlap: int = 50) -> List[Tuple[str, int]]:
         """
@@ -325,71 +337,41 @@
         if chunk_size <= 0:
             raise ValueError("chunk_size must be positive")

-
         chunks = []
         start = 0
         text_len = len(text)

         while start < text_len:
             # Determine the ideal end position
-             end = start + chunk_size
-
-             # If the ideal end is beyond the text length, just take the rest
+             end = min(start + chunk_size, text_len)
+
+             # If we're at the end of the text, just use what's left
             if end >= text_len:
                 chunks.append((text[start:], start))
-                 break  # We've reached the end
+                 break

-             # Try to find a suitable split point (e.g., whitespace) near the ideal end
-             # Search backwards from the ideal end position within a reasonable window (e.g., overlap size)
+             # Try to find a suitable split point (whitespace) to ensure we don't cut words
             split_pos = -1
-             search_start = max(start, end - overlap)  # Don't search too far back
-             for i in range(end, search_start - 1, -1):
-                 # Prefer splitting at whitespace
-                 if text[i].isspace():
-                     split_pos = i + 1  # Split *after* the space
-                     break
-                 # Consider splitting at punctuation as a fallback? (optional)
-                 # import string
-                 # if text[i] in string.punctuation:
-                 #     split_pos = i + 1
-                 #     break
-
-
-             # If no good split point found nearby, just cut at the chunk_size
+             # Search backwards from end to find a whitespace
+             for i in range(end, max(start, end - overlap) - 1, -1):
+                 if i < text_len and text[i].isspace():
+                     split_pos = i + 1  # Position after the space
+                     break
+
+             # If no good split found, just use the calculated end
             if split_pos == -1 or split_pos <= start:
-                 actual_end = end
-                 logger.debug(f"No suitable whitespace found near char {end}, cutting at {actual_end}")
+                 actual_end = end
             else:
-                 actual_end = split_pos
-                 logger.debug(f"Found whitespace split point at char {actual_end}")
-
-
-             # Ensure the chunk isn't empty if split_pos was too close to start
-             if actual_end <= start:
-                 actual_end = end  # Fallback to hard cut if split logic fails
-
-             # Add the chunk and its starting position
+                 actual_end = split_pos
+
+             # Add the chunk
             chunks.append((text[start:actual_end], start))
-
-             # Determine the start of the next chunk
-             # Move forward by chunk_size minus overlap, ensuring progress
-             next_start = start + (chunk_size - overlap)
-
-             # If we split at whitespace (actual_end), we can potentially start the next chunk
-             # right after the split point to avoid redundant processing of the overlap zone
-             # if the split was significantly before the ideal 'end'.
-             # However, the simple `next_start = start + (chunk_size - overlap)` is safer
-             # to ensure consistent overlap handling unless more complex logic is added.
-             # Let's stick to the simpler approach for now:
-             # next_start = actual_end - overlap  # This could lead to variable overlap size
-
-             # Ensure we always make progress
+
+             # Calculate next start position, ensuring we make progress
+             next_start = start + (actual_end - start - overlap)
             if next_start <= start:
-                 logger.warning("Chunking logic resulted in no progress. Moving start by 1.")
                 next_start = start + 1
-
-
+
             start = next_start

-
         return chunks
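
With end clamped to text_len and next_start computed as actual_end - overlap, consecutive chunks now overlap relative to where a chunk actually ended rather than its ideal end. A quick trace of the new loop on a toy string (a standalone re-implementation for illustration, not the module itself):

def create_chunks(text: str, chunk_size: int, overlap: int):
    chunks, start, text_len = [], 0, len(text)
    while start < text_len:
        end = min(start + chunk_size, text_len)
        if end >= text_len:
            chunks.append((text[start:], start))
            break
        split_pos = -1
        for i in range(end, max(start, end - overlap) - 1, -1):
            if i < text_len and text[i].isspace():
                split_pos = i + 1
                break
        actual_end = end if split_pos == -1 or split_pos <= start else split_pos
        chunks.append((text[start:actual_end], start))
        next_start = start + (actual_end - start - overlap)
        start = next_start if next_start > start else start + 1
    return chunks

print(create_chunks("one two three four five six seven", chunk_size=12, overlap=4))
# -> [('one two thre', 0), ('three four ', 8), ('our five six ', 15), ('six seven', 24)]
# Each chunk starts 4 characters before the previous one ended, and splits fall
# after whitespace where one exists within the search window.
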
 