Nivas007 committed on
Commit 4382bfb · verified · 1 Parent(s): ac77ae7

Added initial root files; still need to add the spaCy NER model and the Transformer model

Files changed (4)
  1. app.py +272 -0
  2. mt5_summarize_ner_interactive_perc.py +234 -0
  3. predict2.py +91 -0
  4. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,272 @@
+ # -*- coding: utf-8 -*-
+
+ # --- Prerequisites ---
+ # Ensure these are in your requirements.txt for Hugging Face Spaces:
+ # spacy==3.5.0          # Or the version used to train the NER model
+ # transformers>=4.20.0
+ # torch>=1.10.0         # Or tensorflow
+ # sentencepiece>=0.1.90
+ # protobuf==3.20.3
+ # datasets              # Often needed by transformers/evaluate
+ # evaluate              # If using compute_metrics (not strictly needed for this app)
+ # gradio>=3.0.0
+ # numpy
+ # accelerate            # Good practice for transformers
+
+ import spacy
+ from pathlib import Path
+ import sys
+ import gradio as gr
+ import warnings
+ import re
+ import numpy as np
+ try:
+     from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+     import torch
+ except ImportError:
+     print("✘ Error: 'transformers' or 'torch' library not found.")
+     print("Ensure they are listed in requirements.txt")
+     # Gradio may surface an error in the UI, but it is logged here as well;
+     # model-loading failure is handled later.
+     pass
+
+
+ # --- Configuration ---
+ # 1. Path to your spaCy NER model directory WITHIN THE SPACE REPO
+ #    (upload your model-best folder and adjust the path if needed)
+ NER_MODEL_PATH = Path("./model-best")  # Assumes model-best is at the repo root
+
+ # 2. Hugging Face model name for mT5 summarization
+ SUMMARIZATION_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
+
+ # 3. Device selection (CPU is the default and safer on free HF Spaces)
+ DEVICE = "cpu"
+ # Uncomment below if using GPU hardware on Spaces and CUDA is confirmed working there:
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # 4. Summarization parameters
+ SUMM_NUM_BEAMS = 4
+ MIN_LEN_PERC = 0.30      # Target minimum summary length as % of input tokens
+ MAX_LEN_PERC = 0.75      # Target maximum summary length as % of input tokens
+ ABS_MIN_TOKEN_LEN = 30   # Absolute minimum token length
+ ABS_MAX_TOKEN_LEN = 512  # Absolute maximum token length (adjust based on model/needs)
+ # --- End Configuration ---
+
+ warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
+ warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
+ warnings.filterwarnings("ignore", message=".*The sentencepiece tokenizer that you are converting.*")
+
+ # --- Global variables for loaded models (loaded once at startup) ---
+ ner_model_global = None
+ summ_tokenizer_global = None
+ summ_model_global = None
+ models_loaded = False
+
+ # --- Model Loading Functions (load once into module-level globals) ---
+ def load_ner_model(path):
+     """Loads the spaCy NER model and ensures a sentencizer is present."""
+     global ner_model_global
+     if not path.exists():
+         print(f"✘ FATAL: NER model directory not found at {path.resolve()}")
+         return False
+     try:
+         ner_model_global = spacy.load(path)
+         print(f"✔ Successfully loaded NER model from: {path.resolve()}")
+         # Ensure a sentence boundary detector is present
+         component_to_add_before = None
+         if "tok2vec" in ner_model_global.pipe_names:
+             component_to_add_before = "tok2vec"
+         elif "ner" in ner_model_global.pipe_names:
+             component_to_add_before = "ner"
+         if not ner_model_global.has_pipe("sentencizer") and not ner_model_global.has_pipe("parser"):
+             try:
+                 if component_to_add_before:
+                     ner_model_global.add_pipe("sentencizer", before=component_to_add_before)
+                 else:
+                     ner_model_global.add_pipe("sentencizer", first=True)
+                 print("INFO: Added 'sentencizer' to loaded NER pipeline.")
+             except Exception as e_pipe:
+                 print(f"✘ WARNING: Could not add 'sentencizer': {e_pipe}. Sentence splitting might fail.")
+         return True
+     except Exception as e:
+         print(f"✘ FATAL: Error loading NER model from {path.resolve()}: {e}")
+         return False
+
+ def load_summarizer(model_name):
+     """Loads the Hugging Face tokenizer and model for summarization."""
+     global summ_tokenizer_global, summ_model_global
+     try:
+         print(f"\nLoading summarization tokenizer: {model_name}...")
+         summ_tokenizer_global = AutoTokenizer.from_pretrained(model_name)
+         print(f"Loading summarization model: {model_name}...")
+         summ_model_global = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+         summ_model_global.to(DEVICE)
+         # Optional: override the configured max generation length
+         # (can be unstable, test carefully):
+         # summ_model_global.config.max_length = ABS_MAX_TOKEN_LEN
+         print(f"INFO: Model's default configured max generation length: {summ_model_global.config.max_length}")
+         print(f"✔ Successfully loaded summarization model '{model_name}' on {DEVICE}.")
+         return True
+     except Exception as e:
+         print(f"✘ FATAL: Error loading summarization model '{model_name}': {e}")
+         return False
+
+ # --- Load models when the script starts ---
+ print("Application starting up... Loading models...")
+ models_loaded = load_ner_model(NER_MODEL_PATH) and load_summarizer(SUMMARIZATION_MODEL_NAME)
+ if models_loaded:
+     print("\n--- All models loaded successfully! Ready for input. ---")
+ else:
+     print("\n✘✘✘ CRITICAL ERROR: Model loading failed. The application might not work correctly. Check logs. ✘✘✘")
+
+
+ # --- Core Logic Functions ---
+ def summarize_text(tokenizer, model, text, num_beams=SUMM_NUM_BEAMS,
+                    min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
+     """Generates an abstractive summary with length based on input token percentage."""
+     if not text or text.isspace():
+         return "Input text is empty."
+     print("INFO: Generating summary (using percentage lengths)...")
+     try:
+         # 1. Calculate the input token length (no padding/truncation here)
+         input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
+         input_token_count = input_ids.shape[1]
+         if input_token_count == 0:
+             return "Input text tokenized to zero tokens."
+         print(f"INFO: Input has {input_token_count} tokens.")
+
+         # 2. Calculate target token lengths
+         min_len_tokens = int(input_token_count * min_length_perc)
+         max_len_tokens = int(input_token_count * max_length_perc)
+
+         # 3. Apply absolute limits and ensure min <= max
+         min_len_tokens = max(ABS_MIN_TOKEN_LEN, min_len_tokens)
+         max_len_tokens = max(min_len_tokens + 10, max_len_tokens)
+         max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
+         min_len_tokens = min(min_len_tokens, max_len_tokens)
+         print(f"INFO: Target summary token length: min={min_len_tokens}, max={max_len_tokens}.")
+
+         # 4. Tokenize again for model input (padded/truncated to the model's input limit)
+         inputs = tokenizer(text, max_length=1024, return_tensors="pt", padding="max_length", truncation=True).to(DEVICE)
+
+         # 5. Generate the summary
+         summary_ids = model.generate(inputs['input_ids'],
+                                      num_beams=num_beams,
+                                      max_length=max_len_tokens,
+                                      min_length=min_len_tokens,
+                                      early_stopping=True)
+
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         print("✔ Summary generation complete.")
+         return summary
+     except Exception as e:
+         print(f"✘ Error during summary generation: {e}")
+         return "[Error during summary generation]"
+
+ def extract_entities(ner_nlp, text):
+     """Extracts named entities using the spaCy NER model."""
+     if not text or text.isspace():
+         return []
+     print("INFO: Extracting entities...")
+     try:
+         doc = ner_nlp(text)
+         # Deduplicate (text, label) pairs
+         entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()})
+         print(f"✔ Extracted {len(entities)} unique entities.")
+         return entities
+     except Exception as e:
+         print(f"✘ Error during entity extraction: {e}")
+         return []
+
+ def create_prompted_input(text, entities):
+     """Creates a new input string with the unique entity texts prepended."""
+     if not entities:
+         return text
+     unique_entity_texts = sorted({ent[0] for ent in entities if ent[0]})
+     entity_string = ", ".join(unique_entity_texts)
+     separator = ". முக்கிய சொற்கள்: "  # ". Key terms: "
+     prompted_text = f"{entity_string}{separator}{text}"
+     print(f"INFO: Created prompted input with {len(unique_entity_texts)} unique entities.")
+     return prompted_text
+
+ # --- Main Processing Function for Gradio ---
+ def process_text_for_gradio(input_paragraph):
+     """Takes input text and returns the standard summary and the NER-enhanced output string."""
+     # Check that the models loaded correctly
+     if not models_loaded or ner_model_global is None or summ_tokenizer_global is None or summ_model_global is None:
+         error_msg = "[FATAL ERROR: Models did not load correctly. Check application logs.]"
+         return error_msg, error_msg
+
+     text_to_process = input_paragraph.strip()
+     if not text_to_process:
+         return "(No input text provided)", "(No input text provided)"
+
+     # --- Generate Output 1: standard summary ---
+     standard_summary = summarize_text(
+         summ_tokenizer_global, summ_model_global, text_to_process,
+         num_beams=SUMM_NUM_BEAMS
+     )
+
+     # --- Prepare Output 2: NER analysis + NER-influenced summary ---
+     # a) Extract entities
+     extracted_entities = extract_entities(ner_model_global, text_to_process)
+
+     # b) Create the prompted input
+     prompted_input_text = create_prompted_input(text_to_process, extracted_entities)
+
+     # c) Generate a summary from the prompted input
+     ner_influenced_summary = summarize_text(
+         summ_tokenizer_global, summ_model_global, prompted_input_text,
+         num_beams=SUMM_NUM_BEAMS
+     )
+
+     # d) Format the combined Output 2 string
+     output2_lines = ["--- Key Entities Found by NER ---"]
+     if extracted_entities:
+         for text_ent, label in extracted_entities:
+             output2_lines.append(f"- '{text_ent}' ({label})")
+     else:
+         output2_lines.append("(No entities found by NER model)")
+
+     output2_lines.append("\n--- NER-Influenced Summary ---")
+     output2_lines.append(ner_influenced_summary)
+     output2_lines.append("\n(NOTE: Compare with Output 1. Prepending entities is experimental.)")
+
+     output2_display = "\n".join(output2_lines)
+
+     # Return the two outputs for Gradio
+     return standard_summary, output2_display
+
+
+ # --- Create and Launch Gradio Interface ---
+ print("\nSetting up Gradio interface...")
+ # Description specific to this setup (Tamil, followed by an English translation)
+ app_description = """
+ ஒரு தமிழ் பத்தியை உள்ளிடவும். இந்த பயன்பாடு இரண்டு சுருக்கங்களை உருவாக்கும்:
+ 1. **நிலையான சுருக்கம்:** முன் பயிற்சி பெற்ற mT5 மாதிரியைப் பயன்படுத்தி உருவாக்கப்பட்டது.
+ 2. **NER பகுப்பாய்வு & செல்வாக்கு பெற்ற சுருக்கம்:** உங்கள் தனிப்பயன் NER மாதிரியால் அடையாளம் காணப்பட்ட முக்கிய சொற்களைப் பட்டியலிடுகிறது, பின்னர் அந்த சொற்களை உள்ளீட்டின் முன்சேர்த்து உருவாக்கப்பட்ட சுருக்கத்தைக் காட்டுகிறது (இது சுருக்கத்தில் அவற்றைச் சேர்க்க மாதிரியை பாதிக்கலாம்).
+
+ Enter a Tamil paragraph. This app generates two summaries:
+ 1. **Standard Summary:** Generated using the pre-trained mT5 model.
+ 2. **NER Analysis & Influenced Summary:** Lists key entities identified by your custom NER model, then shows a summary generated by prepending those entities to the input (which may influence the model to include them).
+ """
+
+ # Example inputs
+ example_list = [
+     ["இந்திய கிரிக்கெட் அணியின் முன்னாள் கேப்டனும், சென்னை சூப்பர் கிங்ஸ் அணியின் தற்போதைய கேப்டனுமான எம்.எஸ். தோனி ஐபிஎல் தொடரில் இருந்து ஓய்வு பெறுவதாக வெளியான தகவல்கள் வெறும் வதந்தி என சிஎஸ்கே நிர்வாகம் மறுத்துள்ளது. நேற்று முன்தினம் மும்பை இந்தியன்ஸ் அணிக்கு எதிரான போட்டியில் சென்னை அணி அபார வெற்றி பெற்றது. இதில் தோனியின் கடைசி நேர அதிரடி ஆட்டம் முக்கிய பங்கு வகித்தது."],
+     ["ஜெய்ப்பூர்: ஐபிஎல் 2025 ஆம் ஆண்டு சீசனில் ராஜஸ்தான் ராயல்ஸ் அணிக்காக 14 வயது சூரியவன்ஷி அறிமுகமானார். இதன் மூலம் இளம் வயதில் ஐபிஎல் தொடரில் களமிறங்கிய வீரர் என்ற சாதனையை வைபவ் படைத்திருக்கிறார்."]
+ ]
+
+
+ iface = gr.Interface(
+     fn=process_text_for_gradio,  # The function to call
+     inputs=gr.Textbox(lines=15, label="உள்ளீடு தமிழ் பத்தி (Input Tamil Paragraph)"),
+     outputs=[
+         gr.Textbox(label="வெளியீடு 1: நிலையான சுருக்கம் (Output 1: Standard Summary)"),
+         gr.Textbox(label="வெளியீடு 2: NER பகுப்பாய்வு & செல்வாக்கு பெற்ற சுருக்கம் (Output 2: NER Analysis & Influenced Summary)")
+     ],
+     title="தமிழ் சுருக்கம் மற்றும் NER ஒருங்கிணைப்பு (Tamil Summarization + NER Integration)",
+     description=app_description,
+     allow_flagging='never',
+     examples=example_list
+ )
+
+ print("Launching Gradio interface... Access it at the URL provided.")
+ # Call iface.queue() before launch() to handle multiple simultaneous users;
+ # pass share=True to launch() for a temporary public link (not needed on Spaces).
+ iface.launch(show_error=True)
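
The clamping arithmetic in summarize_text above is easy to misread in passing, so here is a standalone sketch of just that step. The helper name clamp_summary_lengths is ours for illustration; the constants mirror app.py's configuration.

def clamp_summary_lengths(input_token_count,
                          min_perc=0.30, max_perc=0.75,
                          abs_min=30, abs_max=512):
    """Mirror of steps 2-3 in summarize_text: percentage targets, then clamps."""
    min_len = int(input_token_count * min_perc)
    max_len = int(input_token_count * max_perc)
    min_len = max(abs_min, min_len)       # absolute floor on the minimum
    max_len = max(min_len + 10, max_len)  # keep max at least 10 tokens above min
    max_len = min(abs_max, max_len)       # absolute cap on the maximum
    min_len = min(min_len, max_len)       # restore min <= max after the cap
    return min_len, max_len

print(clamp_summary_lengths(200))  # (60, 150): 30%/75% of 200 tokens survive the clamps
print(clamp_summary_lengths(20))   # (30, 40): the absolute floor dominates short inputs
print(clamp_summary_lengths(900))  # (270, 512): the absolute cap dominates long inputs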
mt5_summarize_ner_interactive_perc.py ADDED
@@ -0,0 +1,234 @@
+ # -*- coding: utf-8 -*-
+
+ import spacy
+ from pathlib import Path
+ import sys
+ # Make sure you have installed transformers, torch, sentencepiece, spacy, protobuf==3.20.3
+ try:
+     from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ except ImportError:
+     print("✘ Error: 'transformers' library not found.")
+     print("Please install it: pip install transformers torch sentencepiece")
+     sys.exit(1)
+ import torch
+ import warnings
+ import re  # For slightly better entity checking
+ import numpy as np  # Needed for length calculations
+
+ # --- Configuration ---
+ # 1. Path to your trained spaCy NER model (use your best one!)
+ NER_MODEL_PATH = Path("./training_400/model-best")  # <-- ADJUST TO YOUR BEST NER MODEL
+
+ # 2. Hugging Face model name for mT5 summarization
+ SUMMARIZATION_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
+
+ # 3. Device: "cuda" for GPU or "cpu"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # 4. Summarization parameters
+ SUMM_NUM_BEAMS = 4
+ # --- Percentage-based length ---
+ MIN_LEN_PERC = 0.30  # Target minimum summary length as % of input tokens (e.g., 30%)
+ MAX_LEN_PERC = 0.75  # Target maximum summary length as % of input tokens (e.g., 75%)
+ # --- Absolute token limits (safety net) ---
+ ABS_MIN_TOKEN_LEN = 20   # Don't generate summaries shorter than this many tokens
+ ABS_MAX_TOKEN_LEN = 512  # Don't generate summaries longer than this many tokens
+ # --- End Configuration ---
+
+ warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
+ warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
+
+ # --- Model Loading Functions ---
+ def load_ner_model(path):
+     """Loads the spaCy NER model and ensures a sentencizer is present."""
+     if not path.exists():
+         print(f"✘ Error: NER model directory not found at {path.resolve()}")
+         sys.exit(1)
+     try:
+         nlp = spacy.load(path)
+         print(f"✔ Successfully loaded NER model from: {path.resolve()}")
+         # Ensure a sentence boundary detector is present
+         component_to_add_before = None
+         if "tok2vec" in nlp.pipe_names:
+             component_to_add_before = "tok2vec"
+         elif "ner" in nlp.pipe_names:
+             component_to_add_before = "ner"
+         if not nlp.has_pipe("sentencizer") and not nlp.has_pipe("parser"):
+             try:
+                 if component_to_add_before:
+                     nlp.add_pipe("sentencizer", before=component_to_add_before)
+                 else:
+                     nlp.add_pipe("sentencizer", first=True)
+                 print("INFO: Added 'sentencizer' to loaded NER pipeline.")
+             except Exception as e_pipe:
+                 print(f"✘ WARNING: Could not add 'sentencizer': {e_pipe}. Sentence splitting might fail.")
+         return nlp
+     except Exception as e:
+         print(f"✘ Error loading NER model from {path.resolve()}: {e}")
+         sys.exit(1)
+
+ def load_summarizer(model_name):
+     """Loads the Hugging Face tokenizer and model for summarization."""
+     try:
+         print(f"\nLoading summarization tokenizer: {model_name}...")
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         print(f"Loading summarization model: {model_name} (this may take time)...")
+         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+         model.to(DEVICE)
+         try:
+             new_max = 256  # Desired cap on the configured generation length
+             model.config.max_length = new_max
+             print(f"INFO: Overrode model config max_length to {new_max}")
+         except Exception as e_cfg:
+             print(f"WARN: Could not override model config max_length: {e_cfg}")
+         print(f"INFO: Model's configured max generation length: {model.config.max_length}")
+         print(f"✔ Successfully loaded summarization model '{model_name}' on {DEVICE}.")
+         return tokenizer, model
+     except Exception as e:
+         print(f"✘ Error loading summarization model '{model_name}': {e}")
+         print("Please check the model name, protobuf==3.20.3, and internet access.")
+         sys.exit(1)
+
+ # --- Core Logic Functions ---
+
+ def summarize_text(tokenizer, model, text, num_beams=SUMM_NUM_BEAMS,
+                    min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
+     """Generates an abstractive summary with length based on input token percentage."""
+     if not text or text.isspace():
+         return "Input text is empty."
+     print("\nGenerating summary (using percentage lengths)...")
+     try:
+         # 1. Calculate the input token length (important NOT to pad/truncate here)
+         input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
+         input_token_count = input_ids.shape[1]
+         if input_token_count == 0:
+             return "Input text tokenized to zero tokens."
+         print(f"INFO: Input text has approx. {len(text.split())} words and {input_token_count} tokens.")
+
+         # 2. Calculate target token lengths based on the percentages
+         min_len_tokens = int(input_token_count * min_length_perc)
+         max_len_tokens = int(input_token_count * max_length_perc)
+
+         # 3. Apply absolute limits and ensure min <= max
+         min_len_tokens = max(ABS_MIN_TOKEN_LEN, min_len_tokens)    # absolute minimum
+         max_len_tokens = max(min_len_tokens + 10, max_len_tokens)  # keep max above min
+         max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)    # absolute maximum
+         min_len_tokens = min(min_len_tokens, max_len_tokens)       # re-check after the cap
+
+         print(f"INFO: Target summary token length: min={min_len_tokens}, max={max_len_tokens}.")
+
+         # 4. Tokenize *again* for model input (this time padded/truncated to the
+         #    model's *input* sequence length limit)
+         inputs = tokenizer(text, max_length=1024, return_tensors="pt", padding="max_length", truncation=True).to(DEVICE)
+
+         # 5. Generate the summary using the CALCULATED min/max token lengths
+         summary_ids = model.generate(inputs['input_ids'],
+                                      num_beams=num_beams,
+                                      max_length=max_len_tokens,  # calculated max
+                                      min_length=min_len_tokens,  # calculated min
+                                      early_stopping=True)
+
+         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         print("✔ Summary generation complete.")
+         return summary
+     except Exception as e:
+         print(f"✘ Error during summary generation: {e}")
+         import traceback
+         traceback.print_exc()
+         return "[Error generating summary]"
+
+ def extract_entities(ner_nlp, text):
+     """Extracts named entities using the spaCy NER model."""
+     if not text or text.isspace():
+         return []
+     print("\nExtracting entities from the original text using the custom NER model...")
+     try:
+         doc = ner_nlp(text)
+         entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()})  # unique entities
+         print(f"✔ Extracted {len(entities)} unique entities.")
+         return entities
+     except Exception as e:
+         print(f"✘ Error during entity extraction: {e}")
+         return []
+
+ def create_prompted_input(text, entities):
+     """Creates a new input string with the entity texts prepended."""
+     if not entities:
+         print("INFO: No entities found by NER; using the original text for the prompted summary.")
+         return text
+     entity_string = ", ".join(ent[0] for ent in entities)
+     separator = ". முக்கிய சொற்கள்: "  # ". Key terms: "
+     prompted_text = f"{entity_string}{separator}{text}"
+     print(f"\nINFO: Created prompted input (showing start): {prompted_text[:250]}...")
+     return prompted_text
+
+ # --- Main execution ---
+ def main():
+     # Load models
+     print("Loading models, please wait...")
+     ner_model = load_ner_model(NER_MODEL_PATH)
+     summ_tokenizer, summ_model = load_summarizer(SUMMARIZATION_MODEL_NAME)
+     print("\nModels loaded successfully!")
+     print("=" * 50)
+
+     # Get input text from the user
+     print("Please paste the Tamil text paragraph you want to summarize below.")
+     print("Press Enter after pasting the text.")
+     print("(You might need to configure your terminal for multi-line paste if it's long)")
+     print("-" * 50)
+     input_paragraph = input("Input Text:\n")
+
+     if not input_paragraph or input_paragraph.isspace():
+         print("\n✘ Error: No input text provided. Exiting.")
+         sys.exit(1)
+     text_to_process = input_paragraph.strip()
+
+     print("\n" + "=" * 50)
+     print("Processing Input Text (Snippet):")
+     print(text_to_process[:300] + "...")
+     print("=" * 50)
+
+     # --- Generate Output 1: standard summary (percentage lengths) ---
+     print("\n--- Output 1: Standard Abstractive Summary (Percentage Length) ---")
+     standard_summary = summarize_text(
+         summ_tokenizer, summ_model, text_to_process,
+         num_beams=SUMM_NUM_BEAMS
+         # Uses the default percentages MIN_LEN_PERC / MAX_LEN_PERC from the config section
+     )
+     print("\nStandard Summary:")
+     print(standard_summary)
+     print("-" * 50)
+
+     # --- Generate Output 2: NER-influenced summary (percentage lengths) ---
+     print("\n--- Output 2: NER-Influenced Abstractive Summary (Percentage Length) ---")
+     # a) Extract entities
+     extracted_entities = extract_entities(ner_model, text_to_process)
+     print("\nKey Entities Extracted by NER:")
+     if extracted_entities:
+         for text_ent, label in extracted_entities:
+             print(f"  - '{text_ent}' ({label})")
+     else:
+         print("  No entities found by NER model.")
+
+     # b) Create the prompted input
+     prompted_input_text = create_prompted_input(text_to_process, extracted_entities)
+
+     # c) Generate a summary from the prompted input
+     ner_influenced_summary = summarize_text(
+         summ_tokenizer, summ_model, prompted_input_text,
+         num_beams=SUMM_NUM_BEAMS
+     )
+     print("\nNER-Influenced Summary (generated using entities as a prefix):")
+     print(ner_influenced_summary)
+     print("\nNOTE: Compare this summary with the standard summary (Output 1) to see whether")
+     print("prepending entities influenced the output and included more of them.")
+     print("This method is experimental and does not guarantee inclusion.")
+     print("=" * 50)
+
+
+ if __name__ == "__main__":
+     main()
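
Because this script reads its input via input(), pasting long multi-line paragraphs can be awkward. Below is a small non-interactive variant, sketched under two assumptions: it lives next to mt5_summarize_ner_interactive_perc.py (whose configuration paths are valid), and input_ta.txt is a hypothetical input file.

from pathlib import Path

from mt5_summarize_ner_interactive_perc import (
    NER_MODEL_PATH, SUMMARIZATION_MODEL_NAME, SUMM_NUM_BEAMS,
    load_ner_model, load_summarizer, summarize_text,
    extract_entities, create_prompted_input,
)

# Importing the module only defines functions and config; main() stays behind
# the __main__ guard, so nothing interactive runs on import.
text = Path("input_ta.txt").read_text(encoding="utf-8").strip()  # hypothetical file

ner_model = load_ner_model(NER_MODEL_PATH)
summ_tokenizer, summ_model = load_summarizer(SUMMARIZATION_MODEL_NAME)

# Standard summary, then the NER-influenced one from the prompted input
print(summarize_text(summ_tokenizer, summ_model, text, num_beams=SUMM_NUM_BEAMS))
entities = extract_entities(ner_model, text)
prompted = create_prompted_input(text, entities)
print(summarize_text(summ_tokenizer, summ_model, prompted, num_beams=SUMM_NUM_BEAMS))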
predict2.py ADDED
@@ -0,0 +1,91 @@
+ import spacy
+ from pathlib import Path
+ import sys
+
+ # --- Configuration ---
+ # Ensure this path points to your best trained model directory.
+ # We are using the one trained on the CPU in the previous steps.
+ MODEL_PATH = Path("./training_400/model-best")
+ # --- End Configuration ---
+
+ def load_model(path):
+     """Loads the spaCy model."""
+     if not path.exists():
+         print(f"✘ Error: Model directory not found at {path.resolve()}")
+         print("Please ensure the path is correct and you have trained the model.")
+         sys.exit(1)
+     try:
+         # CuPy warnings may still appear here if the CUDA path isn't set,
+         # but loading should proceed on the CPU for this model.
+         nlp = spacy.load(path)
+         print(f"\n✔ Successfully loaded model from: {path.resolve()}")
+         return nlp
+     except Exception as e:
+         print(f"✘ Error loading model from {path.resolve()}: {e}")
+         print("Please ensure the model path is correct and the model files are intact (especially meta.json).")
+         sys.exit(1)  # Exit if the model can't be loaded
+
+ def predict_entities(nlp, text):
+     """Processes text and prints the entities found."""
+     if not text or text.isspace():
+         print("Input text is empty.")
+         return
+
+     # Limit display length for very long inputs in the prompt message
+     display_text = f"\"{text[:100]}...\"" if len(text) > 100 else f"\"{text}\""
+     print(f"\n---> Processing text: {display_text}")
+
+     # Process the text with the loaded NLP model
+     doc = nlp(text)
+
+     # Check whether any entities were found
+     if doc.ents:
+         print("\n--- Entities Found ---")
+         for ent in doc.ents:
+             print(f"  Text: '{ent.text}'")
+             print(f"  Label: {ent.label_}")
+             print(f"  Start: {ent.start_char}, End: {ent.end_char}")
+             print("-" * 25)  # Separator between entities
+     else:
+         print("\n--- No entities found in this text. ---")
+     print("=" * 40)  # Separator between different predictions
+
+ def main():
+     """Loads the model and runs an interactive prediction loop."""
+     nlp_model = load_model(MODEL_PATH)
+
+     print("\n==============================")
+     print("  Interactive NER Predictor")
+     print("==============================")
+     print(f"Model loaded: {MODEL_PATH.name}")
+     print("Enter Tamil text below to identify entities.")
+     print("Type 'quit' or 'exit' (or just press Enter on an empty line) to stop.")
+     print("-" * 40)
+
+     while True:
+         try:
+             # Get input from the user
+             user_input = input("Enter text >> ")
+
+             # Check for exit conditions
+             if user_input.lower() in ["quit", "exit", ""]:
+                 print("\nExiting predictor.")
+                 break
+
+             # Run prediction
+             predict_entities(nlp_model, user_input)
+
+         except EOFError:  # Handle Ctrl+D in some terminals
+             print("\nExiting predictor.")
+             break
+         except KeyboardInterrupt:  # Handle Ctrl+C cleanly
+             print("\nExiting predictor.")
+             break
+         except Exception as e:
+             print(f"\nAn unexpected error occurred: {e}")
+             # Optionally continue or break depending on severity
+
+
+ if __name__ == "__main__":
+     main()
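
predict2.py handles one text at a time; for scoring many texts, spaCy's nlp.pipe batches documents far more efficiently than calling nlp() in a loop. A minimal sketch reusing load_model from predict2.py follows (the texts list is a placeholder):

from predict2 import MODEL_PATH, load_model

nlp = load_model(MODEL_PATH)

texts = ["முதல் உரை", "இரண்டாவது உரை"]  # placeholder inputs; replace with real data

# nlp.pipe streams Docs in batches instead of processing one text per call
for doc in nlp.pipe(texts, batch_size=32):
    print([(ent.text, ent.label_) for ent in doc.ents])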
requirements.txt ADDED
Binary file (3.39 kB).
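
Since requirements.txt was committed as a binary blob, its contents are not rendered here. Based on the prerequisite comments at the top of app.py, it plausibly pins something like the following (an inference from those comments, not the actual file contents):

spacy==3.5.0
transformers>=4.20.0
torch>=1.10.0
sentencepiece>=0.1.90
protobuf==3.20.3
datasets
evaluate
gradio>=3.0.0
numpy
accelerate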