# mt5_summarize_ner_interactive_perc.py
# -*- coding: utf-8 -*-
import spacy
from pathlib import Path
import sys
# Make sure you have installed transformers, torch, sentencepiece, spacy, protobuf==3.20.3
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    import torch
except ImportError:
    print("✘ Error: 'transformers' or 'torch' library not found.")
    print("Please install them: pip install transformers torch sentencepiece")
    sys.exit(1)
import warnings
# --- Configuration ---
# 1. Path to your trained spaCy NER model (Use your best one!)
NER_MODEL_PATH = Path("./training_400/model-best") # <-- ADJUST TO YOUR BEST NER MODEL
# 2. Hugging Face model name for mT5 summarization
SUMMARIZATION_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
# 3. Device: "cuda" for GPU or "cpu"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# 4. Summarization parameters
SUMM_NUM_BEAMS = 4
# --- Percentage-based length targets ---
MIN_LEN_PERC = 0.30 # Target minimum summary length as % of input tokens (e.g., 30%)
MAX_LEN_PERC = 0.75 # Target maximum summary length as % of input tokens (e.g., 75%)
# --- Absolute token limits (safety net) ---
ABS_MIN_TOKEN_LEN = 20 # Don't generate summaries shorter than this many tokens
ABS_MAX_TOKEN_LEN = 512 # Don't generate summaries longer than this many tokens
# --- End Configuration ---
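
# Worked example of the length rules above (illustrative numbers only):
# a 200-token input gives min = max(20, int(200 * 0.30)) = 60 and
# max = min(512, max(60 + 10, int(200 * 0.75))) = 150, while a 40-token input
# gives min = max(20, 12) = 20 and max = min(512, max(30, 30)) = 30,
# so the absolute floor and ceiling dominate at the extremes.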
warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
# --- Model Loading Functions ---
def load_ner_model(path):
"""Loads the spaCy NER model and ensures sentencizer is present."""
if not path.exists():
print(f"✘ Error: NER Model directory not found at {path.resolve()}")
sys.exit(1)
try:
nlp = spacy.load(path)
print(f"✔ Successfully loaded NER model from: {path.resolve()}")
# Ensure a sentence boundary detector is present
component_to_add_before = None
if "tok2vec" in nlp.pipe_names: component_to_add_before="tok2vec"
elif "ner" in nlp.pipe_names: component_to_add_before="ner"
if not nlp.has_pipe("sentencizer") and not nlp.has_pipe("parser"):
try:
                if component_to_add_before:
                    nlp.add_pipe("sentencizer", before=component_to_add_before)
                else:
                    nlp.add_pipe("sentencizer", first=True)
print("INFO: Added 'sentencizer' to loaded NER pipeline.")
except Exception as e_pipe:
print(f"✘ WARNING: Could not add 'sentencizer': {e_pipe}. Sentence splitting might fail.")
return nlp
except Exception as e:
print(f"✘ Error loading NER model from {path.resolve()}: {e}")
sys.exit(1)
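
# Quick sanity check for the loader above (illustrative snippet, not executed
# by this script): confirm a sentence boundary component made it into the pipe.
#
#   nlp = load_ner_model(NER_MODEL_PATH)
#   print(nlp.pipe_names)  # expect something like ['sentencizer', 'tok2vec', 'ner']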
def load_summarizer(model_name):
"""Loads the Hugging Face tokenizer and model for summarization."""
try:
print(f"\nLoading summarization tokenizer: {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Loading summarization model: {model_name} (this may take time)...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(DEVICE)
        # Optionally lower the model's default generation length; the explicit
        # min_length/max_length passed to generate() later still take precedence.
        try:
            new_max = 256  # Desired default max generation length
            model.config.max_length = new_max
            print(f"INFO: Overrode model config max_length to {new_max}")
        except Exception as e_cfg:
            print(f"WARN: Could not override model config max_length: {e_cfg}")
        print(f"INFO: Model's configured max generation length: {model.config.max_length}")
print(f"✔ Successfully loaded summarization model '{model_name}' on {DEVICE}.")
return tokenizer, model
except Exception as e:
print(f"✘ Error loading summarization model '{model_name}': {e}")
print("Please ensure model name is correct, protobuf==3.20.3, internet access.")
sys.exit(1)
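
# Minimal smoke test for the summarizer loader (illustrative, not executed by
# this script): generate a few tokens to verify the device setup end to end.
#
#   tok, mdl = load_summarizer(SUMMARIZATION_MODEL_NAME)
#   ids = tok("வணக்கம்", return_tensors="pt").input_ids.to(DEVICE)
#   print(tok.decode(mdl.generate(ids, max_length=20)[0], skip_special_tokens=True))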
# --- Core Logic Functions ---
def summarize_text(tokenizer, model, text, num_beams=SUMM_NUM_BEAMS,
min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
"""Generates abstractive summary with length based on input token percentage."""
if not text or text.isspace(): return "Input text is empty."
print("\nGenerating summary (using percentage lengths)...")
try:
# 1. Calculate input token length (important to NOT pad/truncate here)
input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
input_token_count = input_ids.shape[1]
if input_token_count == 0: return "Input text tokenized to zero tokens."
print(f"INFO: Input text has approx {len(text.split())} words and {input_token_count} tokens.")
# 2. Calculate target token lengths based on percentages
min_len_tokens = int(input_token_count * min_length_perc)
max_len_tokens = int(input_token_count * max_length_perc)
# 3. Apply absolute limits and ensure min < max
min_len_tokens = max(ABS_MIN_TOKEN_LEN, min_len_tokens) # Apply absolute minimum
# Ensure max is reasonably larger than min, prevent max < min
max_len_tokens = max(min_len_tokens + 10, max_len_tokens)
# Apply absolute maximum (e.g., model limit or desired cap)
max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
# Ensure min_len is not greater than max_len after caps
min_len_tokens = min(min_len_tokens, max_len_tokens)
print(f"INFO: Target summary token length: min={min_len_tokens}, max={max_len_tokens}.")
# 4. Tokenize *again* for model input (this time with padding/truncation to model max input size)
# Max length here refers to the *input* sequence length limit for the model
inputs = tokenizer(text, max_length=1024, return_tensors="pt", padding="max_length", truncation=True).to(DEVICE)
# 5. Generate summary using CALCULATED min/max token lengths
summary_ids = model.generate(inputs['input_ids'],
num_beams=num_beams,
max_length=max_len_tokens, # Use calculated max
min_length=min_len_tokens, # Use calculated min
early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print("✔ Summary generation complete.")
return summary
except Exception as e:
print(f"✘ Error during summary generation: {e}")
import traceback
traceback.print_exc()
return "[Error generating summary]"
def extract_entities(ner_nlp, text):
"""Extracts named entities using the spaCy NER model."""
if not text or text.isspace(): return []
print("\nExtracting entities from original text using custom NER model...")
try:
doc = ner_nlp(text)
entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()}) # Unique entities
print(f"✔ Extracted {len(entities)} unique entities.")
return entities
except Exception as e:
print(f"✘ Error during entity extraction: {e}")
return []
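
# Note: the set comprehension above deduplicates but discards document order.
# An order-preserving alternative (sketch, relying on dict key uniqueness):
#   entities = list(dict.fromkeys(
#       (ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()))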
def create_prompted_input(text, entities):
"""Creates a new input string with entities prepended."""
if not entities:
print("INFO: No entities found by NER, using original text for prompted summary.")
return text # Return original text if no entities found
entity_string = ", ".join(ent[0] for ent in entities)
separator = ". முக்கிய சொற்கள்: " # ". Key terms: "
prompted_text = f"{entity_string}{separator}{text}"
print(f"\nINFO: Created prompted input (showing start): {prompted_text[:250]}...") # For debugging
return prompted_text
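
# Shape of the prompted input, with hypothetical entities (illustrative only):
#   create_prompted_input("அரசு புதிய திட்டம் அறிவித்தது...", [("சென்னை", "LOC")])
#   -> "சென்னை. முக்கிய சொற்கள்: அரசு புதிய திட்டம் அறிவித்தது..."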
# --- Main execution ---
def main():
# Load models
print("Loading models, please wait...")
ner_model = load_ner_model(NER_MODEL_PATH)
summ_tokenizer, summ_model = load_summarizer(SUMMARIZATION_MODEL_NAME)
print("\nModels loaded successfully!")
print("="*50)
# Get Input Text from User
print("Please paste the Tamil text paragraph you want to summarize below.")
print("Press Enter after pasting the text.")
print("(You might need to configure your terminal for multi-line paste if it's long)")
print("-" * 50)
input_paragraph = input("Input Text:\n") # Get input from user
if not input_paragraph or input_paragraph.isspace():
print("\n✘ Error: No input text provided. Exiting.")
sys.exit(1)
text_to_process = input_paragraph.strip()
print("\n" + "="*50)
print("Processing Input Text (Snippet):")
print(text_to_process[:300] + "...")
print("="*50)
# --- Generate Output 1: Standard Summary (using percentage lengths) ---
print("\n--- Output 1: Standard Abstractive Summary (Percentage Length) ---")
standard_summary = summarize_text(
summ_tokenizer, summ_model, text_to_process,
num_beams=SUMM_NUM_BEAMS
# Uses default percentages MIN_LEN_PERC, MAX_LEN_PERC from config section
)
print("\nStandard Summary:")
print(standard_summary)
print("-" * 50)
# --- Generate Output 2: NER-Influenced Summary (using percentage lengths) ---
print("\n--- Output 2: NER-Influenced Abstractive Summary (Percentage Length) ---")
# a) Extract entities
extracted_entities = extract_entities(ner_model, text_to_process)
print("\nKey Entities Extracted by NER:")
if extracted_entities:
for text_ent, label in extracted_entities:
print(f" - '{text_ent}' ({label})")
else:
print(" No entities found by NER model.")
# b) Create prompted input
prompted_input_text = create_prompted_input(text_to_process, extracted_entities)
# c) Generate summary from prompted input (using percentage lengths)
ner_influenced_summary = summarize_text(
summ_tokenizer, summ_model, prompted_input_text,
num_beams=SUMM_NUM_BEAMS
# Uses default percentages MIN_LEN_PERC, MAX_LEN_PERC from config section
)
print("\nNER-Influenced Summary (Generated using entities as prefix):")
print(ner_influenced_summary)
print("\nNOTE: Compare this summary with the standard summary (Output 1).")
print("See if prepending entities influenced the output and included more of them.")
print("This method is experimental and doesn't guarantee inclusion.")
print("="*50)
if __name__ == "__main__":
main()