Upload mt5_finetuned_summary.py
mt5_finetuned_summary.py
ADDED
@@ -0,0 +1,279 @@
# -*- coding: utf-8 -*-

import spacy
from pathlib import Path
import sys
import warnings
import re
import numpy as np

# --- Prerequisites ---
# Ensure these are installed in your .venv:
# pip install spacy transformers torch sentencepiece protobuf==3.20.3 peft accelerate datasets evaluate gradio numpy
# (Make sure the spacy version matches the version used to train your NER model)
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    import torch
    from peft import PeftModel, PeftConfig  # PEFT classes for loading the adapter
except ImportError as e:
    print(f"✘ Error: Missing required library: {e}")
    print("Please install all dependencies: pip install spacy transformers torch sentencepiece protobuf==3.20.3 peft accelerate datasets evaluate gradio numpy")
    sys.exit(1)


# --- Configuration ---
# 1. Path to your trained spaCy NER model directory
NER_MODEL_PATH = Path("./training_400/model-best")  # <-- ADJUST if different

# 2. Hugging Face model name for the BASE summarization model
BASE_SUMMARIZATION_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"

# 3. Path to your saved PEFT/LoRA adapter directory (output of fine-tuning)
ADAPTER_PATH = Path("./mt5_finetuned_tamil_summary")  # <-- ADJUST if different

# 4. Device: "cuda" for GPU or "cpu"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 5. Summarization parameters
SUMM_NUM_BEAMS = 4
MIN_LEN_PERC = 0.30      # Target minimum summary length as % of input tokens
MAX_LEN_PERC = 0.70      # Target maximum summary length as % of input tokens
ABS_MIN_TOKEN_LEN = 30   # Absolute minimum token length
ABS_MAX_TOKEN_LEN = 512  # Absolute maximum token length (safety cap)
# --- End Configuration ---
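
# Expected on-disk layout (an assumption; adjust the paths above to your setup):
#   ./training_400/model-best/        spaCy NER model directory (config.cfg, meta.json, ...)
#   ./mt5_finetuned_tamil_summary/    PEFT/LoRA adapter directory (adapter_config.json, adapter_model.*)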

# --- Suppress Warnings ---
warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
warnings.filterwarnings("ignore", message=".*The sentencepiece tokenizer that you are converting.*")

# --- Global Variables for Loaded Models ---
ner_model_global = None
summ_tokenizer_global = None
summ_model_global = None  # This will hold the PEFT model
models_loaded = False

# --- Model Loading Functions ---
def load_ner_model(path):
    """Loads the spaCy NER model and ensures a sentencizer is present."""
    global ner_model_global
    if not path.exists():
        print(f"✘ FATAL: NER model directory not found at {path.resolve()}")
        return False
    try:
        ner_model_global = spacy.load(path)
        print(f"✔ Successfully loaded NER model from: {path.resolve()}")
        # Ensure a sentence-boundary detector is present
        component_to_add_before = None
        if "tok2vec" in ner_model_global.pipe_names:
            component_to_add_before = "tok2vec"
        elif "ner" in ner_model_global.pipe_names:
            component_to_add_before = "ner"
        if not ner_model_global.has_pipe("sentencizer") and not ner_model_global.has_pipe("parser"):
            try:
                if component_to_add_before:
                    ner_model_global.add_pipe("sentencizer", before=component_to_add_before)
                else:
                    ner_model_global.add_pipe("sentencizer", first=True)
                print("INFO: Added 'sentencizer' to loaded NER pipeline.")
            except Exception as e_pipe:
                print(f"✘ WARNING: Could not add 'sentencizer': {e_pipe}")
        return True
    except Exception as e:
        print(f"✘ FATAL: Error loading NER model from {path.resolve()}: {e}")
        return False

def load_finetuned_summarizer(base_model_name, adapter_dir_path):
    """Loads the base HF tokenizer/model and applies the PEFT adapters."""
    global summ_tokenizer_global, summ_model_global
    if not adapter_dir_path.exists():
        print(f"✘ FATAL: PEFT adapter directory not found at {adapter_dir_path.resolve()}")
        return False
    try:
        print(f"\nLoading base summarization tokenizer: {base_model_name}...")
        summ_tokenizer_global = AutoTokenizer.from_pretrained(base_model_name)

        print(f"Loading base summarization model: {base_model_name}...")
        base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)

        print(f"Loading PEFT adapter from: {adapter_dir_path}...")
        # Build the fine-tuned model by applying the adapters to the base model
        summ_model_global = PeftModel.from_pretrained(base_model, adapter_dir_path)

        # Optional: merge the adapter weights into the base model. This can make
        # inference slightly faster but increases memory usage, and the adapter
        # can no longer be unloaded easily. Skip it if you plan to switch
        # adapters or continue training later.
        # print("INFO: Merging PEFT adapters into base model...")
        # summ_model_global = summ_model_global.merge_and_unload()
        # print("INFO: Adapters merged.")

        summ_model_global.to(DEVICE)
        print(f"INFO: Model's configured max generation length: {summ_model_global.config.max_length}")
        print(f"✔ Successfully loaded fine-tuned PEFT model '{adapter_dir_path.name}' on base '{base_model_name}' ({DEVICE}).")
        return True
    except Exception as e:
        print(f"✘ FATAL: Error loading fine-tuned summarization model: {e}")
        import traceback
        traceback.print_exc()
        return False
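
# Optional sanity check (an assumption that the extra output is wanted): PeftModel
# provides print_trainable_parameters(), which can help confirm the adapter attached:
#   summ_model_global.print_trainable_parameters()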

# --- Summarization ---
def summarize_text(tokenizer, model, text, num_beams=SUMM_NUM_BEAMS,
                   min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
    """Generates an abstractive summary whose target length is a percentage of the input token count."""
    if not text or text.isspace():
        return "Input text is empty."
    print("\nGenerating summary (using percentage lengths)...")
    try:
        # 1. Count the input tokens (no truncation, so long inputs are measured fully)
        input_ids_tensor = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
        input_token_count = input_ids_tensor.shape[1]
        if input_token_count == 0:
            return "Input text tokenized to zero tokens."
        print(f"INFO: Input text has approx. {len(text.split())} words and {input_token_count} tokens.")

        # 2. Calculate target token lengths
        min_len_tokens = int(input_token_count * min_length_perc)
        max_len_tokens = int(input_token_count * max_length_perc)

        # 3. Apply the absolute limits and keep min <= max
        min_len_tokens = max(ABS_MIN_TOKEN_LEN, min_len_tokens)
        max_len_tokens = max(min_len_tokens + 10, max_len_tokens)
        max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
        min_len_tokens = min(min_len_tokens, max_len_tokens)
        print(f"INFO: Target summary token length: min={min_len_tokens}, max={max_len_tokens}.")

        # 4. Tokenize for model input (truncated to the encoder limit)
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", padding="max_length", truncation=True).to(DEVICE)

        # 5. Generate the summary with the calculated min/max token lengths.
        # Pass input_ids as an explicit keyword argument: PeftModel.generate()
        # does not accept it positionally the way the base model's generate() does.
        print("INFO: Starting model.generate()...")
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_beams=num_beams,
            max_length=max_len_tokens,
            min_length=min_len_tokens,
            early_stopping=True,
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        print("✔ Summary generation complete.")
        return summary
    except TypeError as te:  # Caught separately: usually a wrong positional/keyword argument to generate()
        print(f"✘ TypeError during summary generation: {te}")
        import traceback
        traceback.print_exc()
        return "[TypeError during summary generation - check arguments]"
    except Exception as e:
        print(f"✘ Error during summary generation: {e}")
        import traceback
        traceback.print_exc()
        return "[Error generating summary]"
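
# Worked example of the length clamping above (hypothetical 400-token input):
#   min = max(30, int(400 * 0.30)) = 120
#   max = min(512, max(120 + 10, int(400 * 0.70))) = 280
# so generation targets a summary of 120-280 tokens.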

def extract_entities(ner_nlp, text):
    """Extracts named entities using the spaCy NER model."""
    if not text or text.isspace():
        return []
    print("\nExtracting entities from original text using custom NER model...")
    try:
        doc = ner_nlp(text)
        # De-duplicate on (text, label) pairs
        entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()})
        print(f"✔ Extracted {len(entities)} unique entities.")
        return entities
    except Exception as e:
        print(f"✘ Error during entity extraction: {e}")
        return []

def create_prompted_input(text, entities):
    """Creates a new input string with the unique entity texts prepended."""
    if not entities:
        print("INFO: No entities found by NER; using original text for the prompted summary.")
        return text
    unique_entity_texts = sorted({ent[0] for ent in entities if ent[0]})
    entity_string = ", ".join(unique_entity_texts)
    separator = ". முக்கிய சொற்கள்: "  # Tamil: "Key terms:"
    prompted_text = f"{entity_string}{separator}{text}"
    print(f"\nINFO: Created prompted input with {len(unique_entity_texts)} unique entities (showing start): {prompted_text[:250]}...")
    return prompted_text
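
# Illustration with hypothetical entities [("சென்னை", "LOC"), ("மும்பை", "LOC")]:
# the prompted input becomes
#   "சென்னை, மும்பை. முக்கிய சொற்கள்: <original text>"
# (entity texts are de-duplicated, sorted, and comma-joined ahead of the separator).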

# --- Main execution ---
def main():
    global models_loaded
    if not models_loaded:
        print("✘ Models failed to load during startup. Cannot proceed.")
        sys.exit(1)

    print("\n" + "=" * 50)
    print("Please paste the Tamil text paragraph you want to summarize below.")
    print("Press Enter when finished.")
    print("(You may need to configure your terminal for multi-line paste if the text is long.)")
    print("-" * 50)
    input_paragraph = input("Input Text:\n")

    if not input_paragraph or input_paragraph.isspace():
        print("\n✘ Error: No input text provided. Exiting.")
        sys.exit(1)
    text_to_process = input_paragraph.strip()

    print("\n" + "=" * 50)
    print("Processing Input Text (snippet):")
    print(text_to_process[:300] + "...")
    print("=" * 50)

    # --- Output 1: Standard summary ---
    # Note: even the "standard" summary uses the fine-tuned model.
    print("\n--- Output 1: Standard Abstractive Summary (Fine-tuned Model) ---")
    standard_summary = summarize_text(
        summ_tokenizer_global, summ_model_global, text_to_process,
        num_beams=SUMM_NUM_BEAMS,
    )
    print("\nStandard Summary:")
    print(standard_summary)
    print("-" * 50)

    # --- Output 2: NER-influenced summary ---
    print("\n--- Output 2: NER-Influenced Abstractive Summary (Fine-tuned Model) ---")
    # a) Extract entities
    extracted_entities = extract_entities(ner_model_global, text_to_process)
    print("\nKey Entities Extracted by NER:")
    if extracted_entities:
        for text_ent, label in extracted_entities:
            print(f" - '{text_ent}' ({label})")
    else:
        print(" No entities found by NER model.")

    # b) Create the prompted input
    prompted_input_text = create_prompted_input(text_to_process, extracted_entities)

    # c) Generate a summary from the prompted input
    ner_influenced_summary = summarize_text(
        summ_tokenizer_global, summ_model_global, prompted_input_text,
        num_beams=SUMM_NUM_BEAMS,
    )
    print("\nNER-Influenced Summary (generated with entities as a prefix):")
    print(ner_influenced_summary)
    print("\nNOTE: Compare this summary with the standard summary (Output 1).")
    print("Fine-tuning may make both summaries better reflect your data's style;")
    print("prepending entities is still an experimental way to influence inclusion.")
    print("=" * 50)

if __name__ == "__main__":
    # --- Load models once at startup ---
    print("Application starting up... Loading models...")
    # Load the NER model first; only load the summarizer if NER succeeded.
    ner_loaded_ok = load_ner_model(NER_MODEL_PATH)
    if ner_loaded_ok:
        summ_loaded_ok = load_finetuned_summarizer(BASE_SUMMARIZATION_MODEL_NAME, ADAPTER_PATH)
        models_loaded = summ_loaded_ok  # Overall success depends on the summarizer
    else:
        models_loaded = False  # NER failed; cannot proceed

    if models_loaded:
        print("\n--- All models loaded successfully! Ready for input. ---")
        main()
    else:
        print("\n✘✘✘ CRITICAL ERROR: Model loading failed. Exiting. Check logs above. ✘✘✘")
        sys.exit(1)
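
# To run (assuming the configured model paths exist):
#   python mt5_finetuned_summary.py
# then paste a Tamil paragraph at the "Input Text:" prompt; the script prints the
# standard summary followed by the NER-influenced one.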