|
|
|
|
|
import spacy
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
try:
|
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
except ImportError:
|
|
print("✘ Error: 'transformers' library not found.")
|
|
print("Please install it: pip install transformers torch sentencepiece")
|
|
sys.exit(1)
|
|
import torch
|
|
import warnings
|
|
import re
|
|
import numpy as np
|
|
|
|
|
|
|
|
NER_MODEL_PATH = Path("./training_400/model-best")
|
|
|
|
|
|
SUMMARIZATION_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
|
|
|
|
|
|
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
SUMM_NUM_BEAMS = 4
|
|
|
|
MIN_LEN_PERC = 0.30
|
|
MAX_LEN_PERC = 0.75
|
|
|
|
ABS_MIN_TOKEN_LEN = 20
|
|
ABS_MAX_TOKEN_LEN = 512
|
|
|
|
|
|
warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
|
|
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
|
|
|
|
|
|
|
|
def load_ner_model(path):
    """Load the spaCy NER pipeline stored at *path*.

    If the loaded pipeline has neither a ``sentencizer`` nor a ``parser``
    component, a ``sentencizer`` is inserted (before ``tok2vec`` if present,
    else before ``ner`` if present, else first). Failure to insert it is
    reported as a warning and does not abort loading.

    Args:
        path: Model directory, as a ``pathlib.Path`` or a plain string.

    Returns:
        The loaded spaCy pipeline object.

    Raises:
        SystemExit: With status 1 when the directory does not exist or the
            model cannot be loaded.
    """
    # Accept plain strings as well as Path objects; the original called
    # path.exists() directly and crashed with AttributeError on a str.
    path = Path(path)

    if not path.exists():
        print(f"✘ Error: NER Model directory not found at {path.resolve()}")
        sys.exit(1)

    try:
        nlp = spacy.load(path)
        print(f"✔ Successfully loaded NER model from: {path.resolve()}")

        if not nlp.has_pipe("sentencizer") and not nlp.has_pipe("parser"):
            # Pick the component the sentencizer should be placed in front of.
            component_to_add_before = None
            if "tok2vec" in nlp.pipe_names:
                component_to_add_before = "tok2vec"
            elif "ner" in nlp.pipe_names:
                component_to_add_before = "ner"

            try:
                if component_to_add_before:
                    nlp.add_pipe("sentencizer", before=component_to_add_before)
                else:
                    nlp.add_pipe("sentencizer", first=True)
                print("INFO: Added 'sentencizer' to loaded NER pipeline.")
            except Exception as e_pipe:
                # Best effort: the pipeline is still usable for NER.
                print(f"✘ WARNING: Could not add 'sentencizer': {e_pipe}. Sentence splitting might fail.")

        return nlp
    except Exception as e:
        print(f"✘ Error loading NER model from {path.resolve()}: {e}")
        sys.exit(1)
|
|
|
|
def load_summarizer(model_name):
    """Fetch the Hugging Face tokenizer and seq2seq model used for summaries.

    The model is moved to ``DEVICE`` and its ``config.max_length`` is
    overwritten with 256 (a failure of that override is only reported).

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        SystemExit: With status 1 if anything in the loading sequence fails.
    """
    generation_cap = 256

    try:
        print(f"\nLoading summarization tokenizer: {model_name}...")
        hf_tokenizer = AutoTokenizer.from_pretrained(model_name)

        print(f"Loading summarization model: {model_name} (this may take time)...")
        seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        seq2seq.to(DEVICE)

        # The config tweak is optional; report a failure and carry on.
        try:
            seq2seq.config.max_length = generation_cap
            print(f"INFO: Attempted to override model config max_length to {generation_cap}")
        except Exception as cfg_error:
            print(f"WARN: Could not override model config max_length: {cfg_error}")

        print(f"INFO: Model's configured max generation length: {seq2seq.config.max_length}")
        print(f"✔ Successfully loaded summarization model '{model_name}' on {DEVICE}.")
        return hf_tokenizer, seq2seq
    except Exception as load_error:
        print(f"✘ Error loading summarization model '{model_name}': {load_error}")
        print("Please ensure model name is correct, protobuf==3.20.3, internet access.")
        sys.exit(1)
|
|
|
|
|
|
|
|
|
|
def _summary_length_bounds(token_count, min_length_perc, max_length_perc):
    """Return ``(min_tokens, max_tokens)`` for summarizing *token_count* tokens."""
    min_len_tokens = max(ABS_MIN_TOKEN_LEN, int(token_count * min_length_perc))
    # Always leave at least a 10-token window above the minimum.
    max_len_tokens = max(min_len_tokens + 10, int(token_count * max_length_perc))
    max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
    # The ceiling clamp may have dropped max below min; keep min <= max.
    min_len_tokens = min(min_len_tokens, max_len_tokens)
    return min_len_tokens, max_len_tokens


def summarize_text(tokenizer, model, text, num_beams=SUMM_NUM_BEAMS,
                   min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC,
                   max_input_tokens=1024):
    """Generate an abstractive summary whose length scales with the input.

    The minimum and maximum generation lengths are percentages of the number
    of input tokens the model actually receives (the input is truncated to
    *max_input_tokens*), clamped by ``ABS_MIN_TOKEN_LEN`` and
    ``ABS_MAX_TOKEN_LEN``.

    Args:
        tokenizer: Hugging Face tokenizer matching *model*.
        model: Hugging Face seq2seq model exposing ``generate``.
        text: Text to summarize.
        num_beams: Beam width for beam search.
        min_length_perc: Minimum summary length as a fraction of input tokens.
        max_length_perc: Maximum summary length as a fraction of input tokens.
        max_input_tokens: Inputs longer than this many tokens are truncated
            before generation.

    Returns:
        The decoded summary string. Problems are reported as strings too:
        ``"Input text is empty."``, ``"Input text tokenized to zero tokens."``
        or ``"[Error generating summary]"`` (after printing the traceback).
    """
    if not text or text.isspace():
        return "Input text is empty."

    print("\nGenerating summary (using percentage lengths)...")
    try:
        # Untruncated pass: used only to learn and report the real input size.
        input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
        input_token_count = input_ids.shape[1]
        if input_token_count == 0:
            return "Input text tokenized to zero tokens."
        print(f"INFO: Input text has approx {len(text.split())} words and {input_token_count} tokens.")

        # Size the summary from what the model will really see. Using the
        # untruncated count made very long inputs collapse to
        # min_length == max_length == ABS_MAX_TOKEN_LEN.
        effective_token_count = min(input_token_count, max_input_tokens)
        if effective_token_count < input_token_count:
            print(f"INFO: Input exceeds {max_input_tokens} tokens; only the first "
                  f"{max_input_tokens} tokens are summarized.")

        min_len_tokens, max_len_tokens = _summary_length_bounds(
            effective_token_count, min_length_perc, max_length_perc
        )
        print(f"INFO: Target summary token length: min={min_len_tokens}, max={max_len_tokens}.")

        # A single sequence needs no padding; padding to the full window only
        # adds pad tokens for the encoder to process.
        inputs = tokenizer(
            text,
            max_length=max_input_tokens,
            return_tensors="pt",
            truncation=True,
        ).to(DEVICE)

        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly instead of letting generate() infer it.
            attention_mask=inputs["attention_mask"],
            num_beams=num_beams,
            max_length=max_len_tokens,
            min_length=min_len_tokens,
            early_stopping=True,
        )

        summary = tokenizer.decode(
            summary_ids[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        print("✔ Summary generation complete.")
        return summary
    except Exception as e:
        print(f"✘ Error during summary generation: {e}")
        import traceback
        traceback.print_exc()
        return "[Error generating summary]"
|
|
|
|
|
|
def extract_entities(ner_nlp, text):
    """Extract the unique named entities found in *text*.

    Args:
        ner_nlp: Callable pipeline; ``ner_nlp(text)`` must return an object
            whose ``ents`` items expose ``text`` and ``label_``.
        text: Text to analyse.

    Returns:
        A list of ``(entity_text, label)`` tuples with surrounding whitespace
        stripped, blank entities dropped and duplicates removed, in order of
        first appearance. An empty list is returned for empty/blank input or
        when the pipeline raises.
    """
    if not text or text.isspace():
        return []

    print("\nExtracting entities from original text using custom NER model...")
    try:
        doc = ner_nlp(text)
        stripped_pairs = ((ent.text.strip(), ent.label_) for ent in doc.ents)
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would make the order (and hence the prompted input) arbitrary.
        entities = list(dict.fromkeys(pair for pair in stripped_pairs if pair[0]))
        print(f"✔ Extracted {len(entities)} unique entities.")
        return entities
    except Exception as e:
        print(f"✘ Error during entity extraction: {e}")
        return []
|
|
|
|
|
|
def create_prompted_input(text, entities):
    """Prefix *text* with the comma-separated entity strings.

    Args:
        text: The original text.
        entities: Sequence of tuples whose first item is the entity text.

    Returns:
        ``"<ent1>, <ent2>. முக்கிய சொற்கள்: <text>"`` when *entities* is
        non-empty; otherwise *text* unchanged.
    """
    if entities:
        joined_names = ", ".join(pair[0] for pair in entities)
        combined = "{}{}{}".format(joined_names, ". முக்கிய சொற்கள்: ", text)
        # Echo only the first 250 characters so long inputs stay readable.
        preview = combined[:250]
        print("\nINFO: Created prompted input (showing start): " + preview + "...")
        return combined

    print("INFO: No entities found by NER, using original text for prompted summary.")
    return text
|
|
|
|
|
|
|
|
def main():
    """Interactive entry point: summarize one pasted paragraph two ways.

    Loads the NER and summarization models, reads one line of text from
    standard input, then prints (1) a standard abstractive summary and (2) a
    summary generated from the text prefixed with the entities the NER model
    found.

    Raises:
        SystemExit: With status 1 when no text is provided (including when
            standard input is closed), or from the model loaders on failure.
    """
    print("Loading models, please wait...")
    ner_model = load_ner_model(NER_MODEL_PATH)
    summ_tokenizer, summ_model = load_summarizer(SUMMARIZATION_MODEL_NAME)
    print("\nModels loaded successfully!")
    print("="*50)

    # --- Read the input paragraph ---
    print("Please paste the Tamil text paragraph you want to summarize below.")
    print("Press Enter after pasting the text.")
    print("(You might need to configure your terminal for multi-line paste if it's long)")
    print("-" * 50)
    try:
        input_paragraph = input("Input Text:\n")
    except EOFError:
        # Closed/empty stdin: fall through to the regular "no input" error
        # instead of crashing with a traceback after the models were loaded.
        input_paragraph = ""

    if not input_paragraph or input_paragraph.isspace():
        print("\n✘ Error: No input text provided. Exiting.")
        sys.exit(1)
    text_to_process = input_paragraph.strip()

    print("\n" + "="*50)
    print("Processing Input Text (Snippet):")
    snippet = text_to_process[:300]
    if len(text_to_process) > 300:
        # Mark the echo as cut off only when it really was.
        snippet += "..."
    print(snippet)
    print("="*50)

    # --- Output 1: summary of the text as given ---
    print("\n--- Output 1: Standard Abstractive Summary (Percentage Length) ---")
    standard_summary = summarize_text(
        summ_tokenizer, summ_model, text_to_process,
        num_beams=SUMM_NUM_BEAMS
    )
    print("\nStandard Summary:")
    print(standard_summary)
    print("-" * 50)

    # --- Output 2: summary of the entity-prefixed text ---
    print("\n--- Output 2: NER-Influenced Abstractive Summary (Percentage Length) ---")

    extracted_entities = extract_entities(ner_model, text_to_process)
    print("\nKey Entities Extracted by NER:")
    if extracted_entities:
        for text_ent, label in extracted_entities:
            print(f" - '{text_ent}' ({label})")
    else:
        print(" No entities found by NER model.")

    prompted_input_text = create_prompted_input(text_to_process, extracted_entities)

    ner_influenced_summary = summarize_text(
        summ_tokenizer, summ_model, prompted_input_text,
        num_beams=SUMM_NUM_BEAMS
    )
    print("\nNER-Influenced Summary (Generated using entities as prefix):")
    print(ner_influenced_summary)
    print("\nNOTE: Compare this summary with the standard summary (Output 1).")
    print("See if prepending entities influenced the output and included more of them.")
    print("This method is experimental and doesn't guarantee inclusion.")
    print("="*50)
|
|
|
|
|
|
# Run the interactive pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()