# -*- coding: utf-8 -*-

import spacy
from pathlib import Path
import sys
# Make sure you have installed transformers, torch, sentencepiece, spacy, protobuf==3.20.3
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
except ImportError:
    print("✘ Error: 'transformers' library not found.")
    print("Please install it: pip install transformers torch sentencepiece")
    sys.exit(1)
import torch
import warnings

# --- Configuration ---
# 1. Path to your trained spaCy NER model (Use your best one!)
NER_MODEL_PATH = Path("./training_400/model-best") # <-- ADJUST TO YOUR BEST NER MODEL

# 2. Hugging Face model name for mT5 summarization
SUMMARIZATION_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"

# 3. Device: "cuda" for GPU or "cpu"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 4. Summarization parameters
SUMM_NUM_BEAMS = 4
# Percentage-based target lengths
MIN_LEN_PERC = 0.30 # Target minimum summary length as % of input tokens (e.g., 30%)
MAX_LEN_PERC = 0.75 # Target maximum summary length as % of input tokens (e.g., 75%)
# Absolute token limits (safety net)
ABS_MIN_TOKEN_LEN = 20 # Don't generate summaries shorter than this many tokens
ABS_MAX_TOKEN_LEN = 512 # Don't generate summaries longer than this many tokens
# --- End Configuration ---
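
# Worked example of the length clamping in summarize_text() below, for a
# hypothetical 300-token input with the percentages above:
#   min = max(ABS_MIN_TOKEN_LEN, int(300 * 0.30)) = max(20, 90)           -> 90
#   max = min(ABS_MAX_TOKEN_LEN, max(90 + 10, int(300 * 0.75))) = min(512, 225) -> 225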

warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")

# --- Model Loading Functions ---
def load_ner_model(path):
    """Loads the spaCy NER model and ensures sentencizer is present."""
    if not path.exists():
        print(f"✘ Error: NER Model directory not found at {path.resolve()}")
        sys.exit(1)
    try:
        nlp = spacy.load(path)
        print(f"✔ Successfully loaded NER model from: {path.resolve()}")
        # Ensure a sentence boundary detector is present
        component_to_add_before = None
        if "tok2vec" in nlp.pipe_names: component_to_add_before="tok2vec"
        elif "ner" in nlp.pipe_names: component_to_add_before="ner"
        if not nlp.has_pipe("sentencizer") and not nlp.has_pipe("parser"):
            try:
                if component_to_add_before: nlp.add_pipe("sentencizer", before=component_to_add_before)
                else: nlp.add_pipe("sentencizer", first=True)
                print("INFO: Added 'sentencizer' to loaded NER pipeline.")
            except Exception as e_pipe:
                print(f"✘ WARNING: Could not add 'sentencizer': {e_pipe}. Sentence splitting might fail.")
        return nlp
    except Exception as e:
        print(f"✘ Error loading NER model from {path.resolve()}: {e}")
        sys.exit(1)

def load_summarizer(model_name):
    """Loads the Hugging Face tokenizer and model for summarization."""
    try:
        print(f"\nLoading summarization tokenizer: {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        print(f"Loading summarization model: {model_name} (this may take time)...")
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        model.to(DEVICE)
        # Override the config default; note that generate() below is always
        # called with explicit min/max lengths, which take precedence over this.
        try:
            new_max = 256  # Default max generation length if generate() gets no explicit value
            model.config.max_length = new_max
            print(f"INFO: Overrode model config max_length to {new_max}")
        except Exception as e_cfg:
            print(f"WARN: Could not override model config max_length: {e_cfg}")
        print(f"INFO: Model's configured max generation length: {model.config.max_length}")
        print(f"✔ Successfully loaded summarization model '{model_name}' on {DEVICE}.")
        return tokenizer, model
    except Exception as e:
        print(f"✘ Error loading summarization model '{model_name}': {e}")
        print("Please ensure model name is correct, protobuf==3.20.3, internet access.")
        sys.exit(1)

# --- Core Logic Functions ---

def summarize_text(tokenizer, model, text, num_beams=SUMM_NUM_BEAMS,
                   min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
    """Generates abstractive summary with length based on input token percentage."""
    if not text or text.isspace(): return "Input text is empty."
    print("\nGenerating summary (using percentage lengths)...")
    try:
        # 1. Calculate input token length (important to NOT pad/truncate here)
        input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
        input_token_count = input_ids.shape[1]
        if input_token_count == 0: return "Input text tokenized to zero tokens."
        print(f"INFO: Input text has approx {len(text.split())} words and {input_token_count} tokens.")

        # 2. Calculate target token lengths based on percentages
        min_len_tokens = int(input_token_count * min_length_perc)
        max_len_tokens = int(input_token_count * max_length_perc)

        # 3. Apply absolute limits and ensure min < max
        min_len_tokens = max(ABS_MIN_TOKEN_LEN, min_len_tokens) # Apply absolute minimum
        # Ensure max is reasonably larger than min (prevents max < min)
        max_len_tokens = max(min_len_tokens + 10, max_len_tokens)
        # Apply absolute maximum (e.g., model limit or desired cap)
        max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
        # Ensure min_len is not greater than max_len after the caps
        min_len_tokens = min(min_len_tokens, max_len_tokens)

        print(f"INFO: Target summary token length: min={min_len_tokens}, max={max_len_tokens}.")

        # 4. Tokenize *again* for model input (this time with padding/truncation to model max input size)
        # Max length here refers to the *input* sequence length limit for the model
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", padding="max_length", truncation=True).to(DEVICE)

        # 5. Generate summary using CALCULATED min/max token lengths
        summary_ids = model.generate(inputs['input_ids'],
                                     num_beams=num_beams,
                                     max_length=max_len_tokens, # Use calculated max
                                     min_length=min_len_tokens, # Use calculated min
                                     early_stopping=True)

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        print("✔ Summary generation complete.")
        return summary
    except Exception as e:
        print(f"✘ Error during summary generation: {e}")
        import traceback
        traceback.print_exc()
        return "[Error generating summary]"

def extract_entities(ner_nlp, text):
    """Extracts named entities using the spaCy NER model."""
    if not text or text.isspace(): return []
    print("\nExtracting entities from original text using custom NER model...")
    try:
        doc = ner_nlp(text)
        entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()}) # Unique entities
        print(f"✔ Extracted {len(entities)} unique entities.")
        return entities
    except Exception as e:
        print(f"✘ Error during entity extraction: {e}")
        return []
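
# (Optional helper sketch, not called anywhere: the set comprehension in
# extract_entities() deduplicates but loses document order, so entity order can
# vary between runs. Sorting by first occurrence restores a stable order.)
def sort_entities_by_position(text, entities):
    """Sorts (text, label) entity tuples by their first occurrence in `text`."""
    return sorted(entities, key=lambda ent: text.find(ent[0]))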

def create_prompted_input(text, entities):
    """Creates a new input string with entities prepended."""
    if not entities:
        print("INFO: No entities found by NER, using original text for prompted summary.")
        return text # Return original text if no entities found
    entity_string = ", ".join(ent[0] for ent in entities)
    separator = ". முக்கிய சொற்கள்: " # ". Key terms: "
    prompted_text = f"{entity_string}{separator}{text}"
    print(f"\nINFO: Created prompted input (showing start): {prompted_text[:250]}...") # For debugging
    return prompted_text
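
# Example of the prompted format (hypothetical entities; <text> abbreviates the
# original paragraph):
#   create_prompted_input(<text>, [("சென்னை", "GPE"), ("தமிழ்நாடு", "GPE")])
#   -> "சென்னை, தமிழ்நாடு. முக்கிய சொற்கள்: <text>"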

# --- Main execution ---
def main():
    # Load models
    print("Loading models, please wait...")
    ner_model = load_ner_model(NER_MODEL_PATH)
    summ_tokenizer, summ_model = load_summarizer(SUMMARIZATION_MODEL_NAME)
    print("\nModels loaded successfully!")
    print("="*50)

    # Get Input Text from User
    print("Please paste the Tamil text paragraph you want to summarize below.")
    print("Press Enter after pasting the text.")
    print("(You might need to configure your terminal for multi-line paste if it's long)")
    print("-" * 50)
    input_paragraph = input("Input Text:\n") # Get input from user

    if not input_paragraph or input_paragraph.isspace():
        print("\n✘ Error: No input text provided. Exiting.")
        sys.exit(1)
    text_to_process = input_paragraph.strip()

    print("\n" + "="*50)
    print("Processing Input Text (Snippet):")
    print(text_to_process[:300] + "...")
    print("="*50)

    # --- Generate Output 1: Standard Summary (using percentage lengths) ---
    print("\n--- Output 1: Standard Abstractive Summary (Percentage Length) ---")
    standard_summary = summarize_text(
        summ_tokenizer, summ_model, text_to_process,
        num_beams=SUMM_NUM_BEAMS
        # Uses default percentages MIN_LEN_PERC, MAX_LEN_PERC from config section
    )
    print("\nStandard Summary:")
    print(standard_summary)
    print("-" * 50)

    # --- Generate Output 2: NER-Influenced Summary (using percentage lengths) ---
    print("\n--- Output 2: NER-Influenced Abstractive Summary (Percentage Length) ---")
    # a) Extract entities
    extracted_entities = extract_entities(ner_model, text_to_process)
    print("\nKey Entities Extracted by NER:")
    if extracted_entities:
        for text_ent, label in extracted_entities:
            print(f"  - '{text_ent}' ({label})")
    else:
        print("  No entities found by NER model.")

    # b) Create prompted input
    prompted_input_text = create_prompted_input(text_to_process, extracted_entities)

    # c) Generate summary from prompted input (using percentage lengths)
    ner_influenced_summary = summarize_text(
        summ_tokenizer, summ_model, prompted_input_text,
        num_beams=SUMM_NUM_BEAMS
         # Uses default percentages MIN_LEN_PERC, MAX_LEN_PERC from config section
    )
    print("\nNER-Influenced Summary (Generated using entities as prefix):")
    print(ner_influenced_summary)
    print("\nNOTE: Compare this summary with the standard summary (Output 1).")
    print("See if prepending entities influenced the output and included more of them.")
    print("This method is experimental and doesn't guarantee inclusion.")
    print("="*50)


if __name__ == "__main__":
    main()