Abhigyan committed on
Commit e3f321e · 1 Parent(s): b99eab6

Add app.py

Files changed (1)
  1. app.py +396 -0
app.py ADDED
# ner_module.py
import torch
import time
from typing import List, Dict, Any, Tuple
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class NERModel:
    """
    A singleton class to manage NER model loading and prediction.
    Ensures the potentially large model is loaded only once.
    """
    _instance = None
    _model = None
    _tokenizer = None
    _pipeline = None
    _model_name = None  # Store the model name used for initialization

    @classmethod
    def get_instance(cls, model_name: str = "Davlan/bert-base-multilingual-cased-ner-hrl"):
        """
        Singleton pattern: get the existing instance or create a new one.
        The model_name argument is honored only on first initialization.
        """
        if cls._instance is None:
            logger.info(f"Creating new NERModel instance with model: {model_name}")
            cls._instance = cls(model_name)
        elif cls._model_name != model_name:
            logger.warning(f"NERModel already initialized with {cls._model_name}. Ignoring new model name {model_name}.")
        return cls._instance

    def __init__(self, model_name: str):
        """
        Initialize the model, tokenizer, and pipeline.
        Private constructor -- use get_instance() instead.
        """
        if NERModel._instance is not None:
            raise Exception("This class is a singleton! Use get_instance() to get the object.")
        self.model_name = model_name
        NERModel._model_name = model_name  # Store the model name
        self._load_model()
        NERModel._instance = self  # Assign the instance here

    def _load_model(self):
        """Load the NER model and tokenizer from Hugging Face."""
        logger.info(f"Loading model: {self.model_name}")
        start_time = time.time()

        try:
            # Load tokenizer and model
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self._model = AutoModelForTokenClassification.from_pretrained(self.model_name)

            # Set PyTorch models to evaluation mode (important for inference)
            if isinstance(self._model, torch.nn.Module):
                self._model.eval()
                # self._model.share_memory()  # Only needed with explicit multiprocessing

            # Create the NER pipeline.
            # device=-1 selects CPU, device=0 the first GPU, etc.; by default
            # the pipeline picks a device automatically.
            self._pipeline = pipeline(
                "ner",
                model=self._model,
                tokenizer=self._tokenizer,
                # grouped_entities=True  # Group subword tokens automatically (alternative to manual combination)
            )

            load_time = time.time() - start_time
            logger.info(f"Model '{self.model_name}' loaded successfully in {load_time:.2f} seconds.")

        except Exception as e:
            logger.error(f"Error loading model {self.model_name}: {e}")
            # Clean up partial loads and re-raise to signal failure
            self._tokenizer = None
            self._model = None
            self._pipeline = None
            raise

    def predict(self, text: str) -> List[Dict[str, Any]]:
        """
        Run NER prediction on the input text using the loaded pipeline.

        Args:
            text: The input string to perform NER on.

        Returns:
            A list of dictionaries, one per entity token identified by the
            pipeline. The exact format depends on the pipeline configuration
            (e.g., grouped_entities).
        """
        if self._pipeline is None:
            logger.error("NER pipeline is not initialized. Cannot predict.")
            return []  # Return an empty list rather than raising

        if not text or not isinstance(text, str):
            logger.warning("Prediction called with empty or invalid text.")
            return []

        logger.debug(f"Running prediction on text: '{text[:100]}...'")  # Log a snippet
        try:
            # The pipeline handles tokenization and prediction
            results = self._pipeline(text)
            logger.debug(f"Prediction results: {results}")
            return results
        except Exception as e:
            logger.error(f"Error during NER prediction: {e}")
            return []  # Return an empty list on error

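# --- Illustrative usage (a sketch added for clarity; not part of the original commit).
# With grouped_entities left commented out above, the raw pipeline output is one dict
# per (sub)word token -- the format TextProcessor.combine_entities() below expects.
# Scores and token indices here are made up for illustration:
#
#   model = NERModel.get_instance()
#   raw = model.predict("Angela Merkel lives in Berlin.")
#   # raw ~= [{'entity': 'B-PER', 'score': 0.99, 'index': 1, 'word': 'Angela', 'start': 0, 'end': 6},
#   #         {'entity': 'I-PER', 'score': 0.99, 'index': 2, 'word': 'Merkel', 'start': 7, 'end': 13},
#   #         {'entity': 'B-LOC', 'score': 0.99, 'index': 5, 'word': 'Berlin', 'start': 23, 'end': 29}]
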
class TextProcessor:
    """
    Provides static methods for processing text for NER tasks, including
    combining subword entities and handling large texts via chunking.
    """

    @staticmethod
    def combine_entities(ner_results: List[Dict[str, Any]], original_text: str) -> List[Dict[str, Any]]:
        """
        Combine entities that may be split across subword tokens (B-TAG, I-TAG).
        This method assumes the pipeline did *not* use grouped_entities=True.

        Args:
            ner_results: The raw output from the NER pipeline (a list of token dictionaries).
            original_text: The original input text, used to extract entity words accurately.

        Returns:
            A list of dictionaries, each representing a combined entity with
            'entity_type', 'start', 'end', 'score', and 'word'.
        """
        if not ner_results:
            return []

        combined_entities = []
        current_entity = None

        for token in ner_results:
            # Basic validation of token structure
            if not all(k in token for k in ['entity', 'start', 'end', 'score']):
                logger.warning(f"Skipping malformed token: {token}")
                continue

            # Skip 'O' tags (outside any entity)
            if token['entity'] == 'O':
                # If we were tracking an entity, finalize it before moving on
                if current_entity:
                    combined_entities.append(current_entity)
                    current_entity = None
                continue

            # Extract the entity type (e.g., 'PER', 'LOC') by removing the 'B-' or 'I-' prefix
            entity_tag = token['entity']
            if entity_tag.startswith('B-') or entity_tag.startswith('I-'):
                entity_type = entity_tag[2:]
            else:
                # Handle tags without a B-/I- prefix (less common)
                logger.warning(f"Unexpected entity tag format: {entity_tag}. Using as is.")
                entity_type = entity_tag

            # Start of a new entity ('B-'), or an 'I-' that doesn't continue the current entity
            if entity_tag.startswith('B-') or (entity_tag.startswith('I-') and (not current_entity or current_entity['entity_type'] != entity_type)):
                # Finalize the previous entity if it exists
                if current_entity:
                    combined_entities.append(current_entity)

                # Start the new entity
                current_entity = {
                    'entity_type': entity_type,
                    'start': token['start'],
                    'end': token['end'],
                    'score': float(token['score']),
                    'token_count': 1  # Track the token count for score averaging
                }

            # Continuation of the current entity ('I-' with a matching type)
            elif entity_tag.startswith('I-') and current_entity and current_entity['entity_type'] == entity_type:
                # Extend the end position
                current_entity['end'] = token['end']
                # Update the running average of the score
                current_entity['score'] = (current_entity['score'] * current_entity['token_count'] + float(token['score'])) / (current_entity['token_count'] + 1)
                current_entity['token_count'] += 1

            # Handle unexpected cases (e.g., an 'I-' tag without a preceding 'B-' or matching 'I-')
            else:
                logger.warning(f"Encountered unexpected token sequence at: {token}. Resetting current entity.")
                if current_entity:
                    combined_entities.append(current_entity)
                current_entity = None  # Reset

        # Add the last tracked entity if it exists
        if current_entity:
            combined_entities.append(current_entity)

        # Extract the actual text 'word' for each combined entity
        for entity in combined_entities:
            try:
                entity['word'] = original_text[entity['start']:entity['end']].strip()
            except IndexError:
                logger.error(f"Index error extracting word for entity: {entity} with text length {len(original_text)}")
                entity['word'] = "[Error extracting word]"
            # Remove the internal helper key (even if word extraction failed)
            entity.pop('token_count', None)

        # Sort entities by start position
        combined_entities.sort(key=lambda x: x['start'])

        logger.info(f"Combined {len(ner_results)} raw tokens into {len(combined_entities)} entities.")
        return combined_entities

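    # --- Worked example (a sketch added for clarity; not part of the original commit).
    # Feeding the raw token output sketched above through combine_entities merges the
    # B-PER/I-PER pair into a single span and recovers the surface form from the text:
    #
    #   text = "Angela Merkel lives in Berlin."
    #   TextProcessor.combine_entities(raw, text)
    #   # ~= [{'entity_type': 'PER', 'start': 0, 'end': 13, 'score': 0.99, 'word': 'Angela Merkel'},
    #   #     {'entity_type': 'LOC', 'start': 23, 'end': 29, 'score': 0.99, 'word': 'Berlin'}]
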
    @staticmethod
    def process_large_text(text: str, model: NERModel, chunk_size: int = 512, overlap: int = 50) -> List[Dict[str, Any]]:
        """
        Process a large text by splitting it into overlapping chunks, running NER
        on each chunk, and then combining the results.

        Args:
            text: The large input text string.
            model: The initialized NERModel instance.
            chunk_size: The maximum size of each chunk. Note that chunking here is
                done in characters, while the tokenizer's limit (often 512 for
                BERT-style models) is in tokens, so the cap below is approximate.
            overlap: The number of characters to overlap between consecutive chunks
                so that entities spanning chunk boundaries are still captured.

        Returns:
            A list of combined entity dictionaries for the entire text.
        """
        if not text:
            return []

        # Cap chunk_size at the tokenizer's maximum length if one is available
        if model._tokenizer and hasattr(model._tokenizer, 'model_max_length'):
            tokenizer_max_len = model._tokenizer.model_max_length
            if chunk_size > tokenizer_max_len:
                logger.warning(f"Requested chunk_size {chunk_size} exceeds model max length {tokenizer_max_len}. Using {tokenizer_max_len}.")
                chunk_size = tokenizer_max_len
        # Ensure the overlap is reasonable compared to the chunk size
        if overlap >= chunk_size // 2:
            logger.warning(f"Overlap {overlap} seems large for chunk_size {chunk_size}. Reducing overlap to {chunk_size // 4}.")
            overlap = chunk_size // 4

        logger.info(f"Processing large text (length {len(text)}) with chunk_size={chunk_size}, overlap={overlap}")
        chunks = TextProcessor._create_chunks(text, chunk_size, overlap)
        logger.info(f"Split text into {len(chunks)} chunks.")

        all_raw_results = []
        total_processing_time = 0

        for i, (chunk_text, start_pos) in enumerate(chunks):
            logger.debug(f"Processing chunk {i+1}/{len(chunks)} (start_pos: {start_pos}, length: {len(chunk_text)})")
            start_time = time.time()

            # Get raw predictions for the current chunk
            raw_results_chunk = model.predict(chunk_text)

            chunk_processing_time = time.time() - start_time
            total_processing_time += chunk_processing_time
            logger.debug(f"Chunk {i+1} processed in {chunk_processing_time:.2f}s. Found {len(raw_results_chunk)} raw entities.")

            # Adjust entity positions so they refer to the original text
            for result in raw_results_chunk:
                # Check that 'start' and 'end' exist before adjusting
                if 'start' in result and 'end' in result:
                    result['start'] += start_pos
                    result['end'] += start_pos
                else:
                    logger.warning(f"Skipping position adjustment for malformed result in chunk {i+1}: {result}")

            all_raw_results.extend(raw_results_chunk)

        logger.info(f"Finished processing all chunks in {total_processing_time:.2f} seconds.")
        logger.info(f"Total raw entities found across all chunks: {len(all_raw_results)}")

        # Combine entities from all chunks. Note that combine_entities does not
        # handle chunk overlaps specially: an entity that falls entirely inside
        # the overlap region of two chunks will appear twice in the raw results.
        # A more robust approach would merge based on position and confidence;
        # here we simply deduplicate exact (start, end, type) matches below.
        combined_entities = TextProcessor.combine_entities(all_raw_results, text)

        # Simple deduplication based on exact start, end, and entity type
        unique_entities = []
        seen_entities = set()
        for entity in combined_entities:
            entity_key = (entity['start'], entity['end'], entity['entity_type'])
            if entity_key not in seen_entities:
                unique_entities.append(entity)
                seen_entities.add(entity_key)
            else:
                logger.debug(f"Duplicate entity removed: {entity}")

        logger.info(f"Final number of unique combined entities: {len(unique_entities)}")
        return unique_entities

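    # --- Worked example of the offset arithmetic above (added for illustration;
    # not part of the original commit). With chunk_size=512 and overlap=50,
    # chunk 0 covers roughly chars [0, 512) and chunk 1 starts at 512 - 50 = 462.
    # An entity the pipeline reports at (start=10, end=16) within chunk 1 therefore
    # maps back to (472, 478) in the original text, which is exactly what the
    # start_pos adjustment in the loop computes.
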
    @staticmethod
    def _create_chunks(text: str, chunk_size: int = 512, overlap: int = 50) -> List[Tuple[str, int]]:
        """
        Split text into overlapping chunks, trying to respect word boundaries.

        Args:
            text: The input text string.
            chunk_size: The target maximum size of each chunk.
            overlap: The desired overlap between consecutive chunks.

        Returns:
            A list of (chunk_text, start_position_in_original_text) tuples.
        """
        if not text:
            return []
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_size <= overlap:
            raise ValueError("chunk_size must be greater than overlap")

        chunks = []
        start = 0
        text_len = len(text)

        while start < text_len:
            # Determine the ideal end position
            end = start + chunk_size

            # If the ideal end is beyond the text length, just take the rest
            if end >= text_len:
                chunks.append((text[start:], start))
                break  # We've reached the end

            # Try to find a suitable split point (whitespace) near the ideal end,
            # searching backwards within a window of `overlap` characters
            split_pos = -1
            search_start = max(start, end - overlap)  # Don't search too far back
            for i in range(end, search_start - 1, -1):
                # Prefer splitting at whitespace
                if text[i].isspace():
                    split_pos = i + 1  # Split *after* the space
                    break
                # Splitting at punctuation could serve as a fallback here (optional)

            # If no good split point was found nearby, just cut at chunk_size
            if split_pos == -1 or split_pos <= start:
                actual_end = end
                logger.debug(f"No suitable whitespace found near char {end}, cutting at {actual_end}")
            else:
                actual_end = split_pos
                logger.debug(f"Found whitespace split point at char {actual_end}")

            # Ensure the chunk isn't empty if split_pos was too close to start
            if actual_end <= start:
                actual_end = end  # Fall back to a hard cut if the split logic fails

            # Add the chunk and its starting position
            chunks.append((text[start:actual_end], start))

            # Advance by chunk_size minus overlap. Starting the next chunk at
            # actual_end - overlap would also work, but it makes the overlap size
            # variable, so we keep the simpler fixed stride.
            next_start = start + (chunk_size - overlap)

            # Ensure we always make progress
            if next_start <= start:
                logger.warning("Chunking logic resulted in no progress. Moving start by 1.")
                next_start = start + 1

            start = next_start

        return chunks
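

# --- Minimal self-test (a hedged sketch added for illustration; not part of the
# original commit). Running this module directly downloads the default model, so
# it assumes network access plus the torch/transformers dependencies above.
if __name__ == "__main__":
    sample = (
        "Angela Merkel visited Paris last year. "
        "She met officials from the United Nations. "
    ) * 40  # Repeat to force process_large_text into multiple chunks

    ner_model = NERModel.get_instance()
    found = TextProcessor.process_large_text(sample, ner_model, chunk_size=400, overlap=50)
    for ent in found[:10]:
        print(f"{ent['entity_type']:>4}  {ent['word']!r}  [{ent['start']}:{ent['end']}]  score={ent['score']:.2f}")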