Added initial root files; still need to add the spaCy NER model and Transformer model
Browse files- app.py +272 -0
- mt5_summarize_ner_interactive_perc.py +234 -0
- predict2.py +91 -0
- requirements.txt +0 -0
app.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
# --- Prerequisites ---
|
4 |
+
# Ensure these are in your requirements.txt for Hugging Face Spaces:
|
5 |
+
# spacy==3.5.0 # Or the version used to train NER model
|
6 |
+
# transformers>=4.20.0
|
7 |
+
# torch>=1.10.0 # Or tensorflow
|
8 |
+
# sentencepiece>=0.1.90
|
9 |
+
# protobuf==3.20.3
|
10 |
+
# datasets # Often needed by transformers/evaluate
|
11 |
+
# evaluate # If using compute_metrics (not strictly needed for this app)
|
12 |
+
# gradio>=3.0.0
|
13 |
+
# numpy
|
14 |
+
# accelerate # Good practice for transformers
|
15 |
+
|
16 |
+
import spacy
|
17 |
+
from pathlib import Path
|
18 |
+
import sys
|
19 |
+
import gradio as gr # Import Gradio
|
20 |
+
import warnings
|
21 |
+
import re
|
22 |
+
import numpy as np
|
23 |
+
try:
|
24 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
25 |
+
import torch
|
26 |
+
except ImportError:
|
27 |
+
print("✘ Error: 'transformers' or 'torch' library not found.")
|
28 |
+
print("Ensure they are listed in requirements.txt")
|
29 |
+
# Gradio might handle showing an error in the UI, but good to log.
|
30 |
+
# We'll handle model loading failure later.
|
31 |
+
pass
|
32 |
+
|
33 |
+
|
34 |
+
# --- Configuration ---
|
35 |
+
# 1. Path to your spaCy NER model directory WITHIN THE SPACE REPO
|
36 |
+
# (Upload your model-best folder and adjust path if needed)
|
37 |
+
NER_MODEL_PATH = Path("./model-best") # Assumes model-best is at the repo root
|
38 |
+
|
39 |
+
# 2. Hugging Face model name for mT5 summarization
|
40 |
+
SUMMARIZATION_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
|
41 |
+
|
42 |
+
# 3. Device Selection (CPU is default/safer for free HF Spaces)
|
43 |
+
DEVICE = "cpu"
|
44 |
+
# Uncomment below if using GPU hardware on Spaces and CUDA is confirmed working there
|
45 |
+
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
46 |
+
|
47 |
+
# 4. Summarization parameters
|
48 |
+
SUMM_NUM_BEAMS = 4
|
49 |
+
MIN_LEN_PERC = 0.30 # Target minimum summary length as % of input tokens
|
50 |
+
MAX_LEN_PERC = 0.75 # Target maximum summary length as % of input tokens
|
51 |
+
ABS_MIN_TOKEN_LEN = 30 # Absolute minimum token length
|
52 |
+
ABS_MAX_TOKEN_LEN = 512 # Absolute maximum token length (Adjust based on model/needs)
|
53 |
+
# --- End Configuration ---
|
54 |
+
|
55 |
+
warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
|
56 |
+
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
|
57 |
+
warnings.filterwarnings("ignore", message=".*The sentencepiece tokenizer that you are converting.*")
|
58 |
+
|
59 |
+
# --- Global Variables for Loaded Models (Load Once) ---
|
60 |
+
ner_model_global = None
|
61 |
+
summ_tokenizer_global = None
|
62 |
+
summ_model_global = None
|
63 |
+
models_loaded = False
|
64 |
+
|
65 |
+
# --- Model Loading Functions (Modified slightly for global loading) ---
|
66 |
+
def load_ner_model(path):
    """Load the spaCy pipeline at *path* into ``ner_model_global``.

    If the loaded pipeline has neither a ``sentencizer`` nor a ``parser``
    component, a ``sentencizer`` is inserted: before ``tok2vec`` when that
    component exists, otherwise before ``ner``, otherwise at the front of the
    pipeline. Failure to insert it is only logged as a warning.

    Args:
        path: ``pathlib.Path`` pointing at the model directory.

    Returns:
        ``True`` when the pipeline was loaded, ``False`` when the directory
        is missing or loading raised an exception.
    """
    global ner_model_global  # this function populates the module-level model
    if not path.exists():
        print(f"✘ FATAL: NER Model directory not found at {path.resolve()}")
        return False

    try:
        ner_model_global = spacy.load(path)
        print(f"✔ Successfully loaded NER model from: {path.resolve()}")

        pipeline = ner_model_global
        # The first of these components that exists is the insertion anchor.
        anchor = next(
            (name for name in ("tok2vec", "ner") if name in pipeline.pipe_names),
            None,
        )
        can_split_sentences = pipeline.has_pipe("sentencizer") or pipeline.has_pipe("parser")
        if not can_split_sentences:
            try:
                if anchor:
                    pipeline.add_pipe("sentencizer", before=anchor)
                else:
                    pipeline.add_pipe("sentencizer", first=True)
                print("INFO: Added 'sentencizer' to loaded NER pipeline.")
            except Exception as pipe_error:
                print(f"✘ WARNING: Could not add 'sentencizer': {pipe_error}. Sentence splitting might fail.")
        return True
    except Exception as load_error:
        print(f"✘ FATAL: Error loading NER model from {path.resolve()}: {load_error}")
        return False
|
90 |
+
|
91 |
+
def load_summarizer(model_name):
    """Load the Hugging Face tokenizer and seq2seq model named *model_name*.

    The tokenizer is stored in ``summ_tokenizer_global`` and the model in
    ``summ_model_global``; the model is moved to ``DEVICE``.

    Args:
        model_name: Hub identifier (or local path) of the checkpoint.

    Returns:
        ``True`` on success, ``False`` if any step raised an exception.
    """
    global summ_tokenizer_global, summ_model_global  # populated below
    try:
        print(f"\nLoading summarization tokenizer: {model_name}...")
        summ_tokenizer_global = AutoTokenizer.from_pretrained(model_name)

        print(f"Loading summarization model: {model_name}...")
        summ_model_global = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        summ_model_global.to(DEVICE)

        # NOTE: config.max_length is intentionally left at the checkpoint's
        # value; overriding it with ABS_MAX_TOKEN_LEN was tried and disabled
        # as potentially unstable. The value is only reported here.
        configured_max = summ_model_global.config.max_length
        print(f"INFO: Model's default configured max generation length: {configured_max}")
        print(f"✔ Successfully loaded summarization model '{model_name}' on {DEVICE}.")
    except Exception as load_error:
        print(f"✘ FATAL: Error loading summarization model '{model_name}': {load_error}")
        return False
    return True
|
112 |
+
|
113 |
+
# --- Load models when the script starts ---
|
114 |
+
# Load both models once at import time. The `and` short-circuits, so the
# summarizer is not loaded at all when the NER model fails to load.
print("Application starting up... Loading models...")
models_loaded = load_ner_model(NER_MODEL_PATH) and load_summarizer(SUMMARIZATION_MODEL_NAME)
startup_status = (
    "\n--- All models loaded successfully! Ready for input. ---"
    if models_loaded
    else "\n✘✘✘ CRITICAL ERROR: Model loading failed. The application might not work correctly. Check logs. ✘✘✘"
)
print(startup_status)
|
120 |
+
|
121 |
+
|
122 |
+
# --- Core Logic Functions (Keep as they were) ---
|
123 |
+
def summarize_text(tokenizer, model, text, num_beams=SUMM_NUM_BEAMS,
                   min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
    """Generate an abstractive summary whose length scales with the input.

    The text is tokenized once, truncated to at most 1024 tokens and left
    unpadded. The minimum and maximum generation lengths are percentages of
    that model-input token count, clamped by ``ABS_MIN_TOKEN_LEN`` and
    ``ABS_MAX_TOKEN_LEN``.

    Args:
        tokenizer: Hugging Face tokenizer matching *model*.
        model: Seq2seq model exposing ``generate``.
        text: Text to summarize.
        num_beams: Beam width for beam search.
        min_length_perc: Minimum summary length as a fraction of input tokens.
        max_length_perc: Maximum summary length as a fraction of input tokens.

    Returns:
        The decoded summary, or a short explanatory string when the input is
        empty or generation fails (exceptions are logged, not raised).
    """
    if not text or text.isspace():
        return "Input text is empty."
    print("INFO: Generating summary (using percentage lengths)...")  # Use print for logs
    max_input_tokens = 1024  # encoder input cap used by this app
    try:
        # 1. Tokenize once. No padding: a single sequence gains nothing from
        #    being padded to 1024 tokens and it makes the encoder pass far
        #    more expensive, especially on CPU.
        inputs = tokenizer(text, max_length=max_input_tokens, return_tensors="pt",
                           padding=False, truncation=True)
        input_token_count = inputs["input_ids"].shape[1]
        if input_token_count == 0:
            return "Input text tokenized to zero tokens."
        print(f"INFO: Input has {input_token_count} tokens.")
        if input_token_count >= max_input_tokens:
            print(f"INFO: Input reached the {max_input_tokens}-token limit and may have been truncated.")

        # 2. Target lengths are based on what the model actually sees, so a
        #    very long input no longer forces min == max == ABS_MAX_TOKEN_LEN.
        min_len_tokens = int(input_token_count * min_length_perc)
        max_len_tokens = int(input_token_count * max_length_perc)

        # 3. Apply absolute limits and keep min <= max.
        min_len_tokens = max(ABS_MIN_TOKEN_LEN, min_len_tokens)
        max_len_tokens = max(min_len_tokens + 10, max_len_tokens)
        max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
        min_len_tokens = min(min_len_tokens, max_len_tokens)
        print(f"INFO: Target summary token length: min={min_len_tokens}, max={max_len_tokens}.")

        # 4. Generate, passing the attention mask explicitly instead of
        #    relying on generate() to infer it from pad tokens.
        inputs = inputs.to(DEVICE)
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            num_beams=num_beams,
            max_length=max_len_tokens,
            min_length=min_len_tokens,
            early_stopping=True,
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True,
                                   clean_up_tokenization_spaces=True)
        print("✔ Summary generation complete.")
        return summary
    except Exception as e:
        # Best effort: the UI shows a placeholder instead of crashing.
        print(f"✘ Error during summary generation: {e}")
        return "[Error during summary generation]"
|
162 |
+
|
163 |
+
def extract_entities(ner_nlp, text):
    """Return the unique named entities found in *text*.

    Args:
        ner_nlp: Callable (a spaCy pipeline) that maps text to an object with
            an ``ents`` iterable; each entity exposes ``text`` and ``label_``.
        text: Text to analyse.

    Returns:
        A list of ``(entity_text, label)`` tuples in order of first
        appearance, with surrounding whitespace stripped and blank entities
        dropped. An empty list is returned for empty input or when the
        pipeline raises (the error is logged).
    """
    if not text or text.isspace():
        return []
    print("INFO: Extracting entities...")
    try:
        doc = ner_nlp(text)
        stripped_pairs = ((ent.text.strip(), ent.label_) for ent in doc.ents)
        # dict.fromkeys de-duplicates while keeping document order; a set
        # would make the order vary from run to run.
        entities = list(dict.fromkeys(pair for pair in stripped_pairs if pair[0]))
        print(f"✔ Extracted {len(entities)} unique entities.")
        return entities
    except Exception as e:
        print(f"✘ Error during entity extraction: {e}")
        return []
|
175 |
+
|
176 |
+
def create_prompted_input(text, entities):
    """Prefix *text* with the sorted, de-duplicated entity texts.

    Args:
        text: Original input text.
        entities: Sequence of ``(entity_text, label)`` pairs; may be empty.

    Returns:
        *text* unchanged when *entities* is empty; otherwise the entity texts
        joined by ", ", followed by the Tamil key-terms separator and *text*.
    """
    if not entities:
        return text

    distinct_names = {pair[0] for pair in entities if pair[0]}
    ordered_names = sorted(distinct_names)
    entity_string = ", ".join(ordered_names)
    separator = ". முக்கிய சொற்கள்: "
    print(f"INFO: Created prompted input with {len(ordered_names)} unique entities.")
    return f"{entity_string}{separator}{text}"
|
186 |
+
|
187 |
+
# --- Main Processing Function for Gradio ---
|
188 |
+
def process_text_for_gradio(input_paragraph):
    """Produce both Gradio outputs for one input paragraph.

    Args:
        input_paragraph: Text from the input textbox; ``None`` is treated as
            empty.

    Returns:
        A ``(standard_summary, ner_report)`` tuple of strings. ``ner_report``
        lists the entities found by the NER model followed by a summary
        generated from the entity-prefixed input. Both elements carry an
        explanatory message when the models are not loaded or the input is
        empty.
    """
    # Refuse to run when start-up loading failed for either model.
    models_ready = (
        models_loaded
        and ner_model_global is not None
        and summ_tokenizer_global is not None
        and summ_model_global is not None
    )
    if not models_ready:
        error_msg = "[FATAL ERROR: Models did not load correctly. Check application logs.]"
        return error_msg, error_msg

    # Guard against None so .strip() cannot raise.
    text_to_process = (input_paragraph or "").strip()
    if not text_to_process:
        return "(No input text provided)", "(No input text provided)"

    # --- Output 1: standard summary ---
    standard_summary = summarize_text(
        summ_tokenizer_global, summ_model_global, text_to_process,
        num_beams=SUMM_NUM_BEAMS,
    )

    # --- Output 2: NER analysis + NER-influenced summary ---
    extracted_entities = extract_entities(ner_model_global, text_to_process)
    prompted_input_text = create_prompted_input(text_to_process, extracted_entities)

    if prompted_input_text == text_to_process:
        # No entity prefix was added, so the model input is identical to the
        # one already summarized; reuse that result instead of paying for a
        # second generation pass (beam search is used without sampling here).
        ner_influenced_summary = standard_summary
    else:
        ner_influenced_summary = summarize_text(
            summ_tokenizer_global, summ_model_global, prompted_input_text,
            num_beams=SUMM_NUM_BEAMS,
        )

    output2_lines = ["--- Key Entities Found by NER ---"]
    if extracted_entities:
        output2_lines.extend(
            f"- '{text_ent}' ({label})" for text_ent, label in extracted_entities
        )
    else:
        output2_lines.append("(No entities found by NER model)")

    output2_lines.append("\n--- NER-Influenced Summary ---")
    output2_lines.append(ner_influenced_summary)
    output2_lines.append("\n(NOTE: Compare with Output 1. Prepending entities is experimental.)")

    output2_display = "\n".join(output2_lines)

    # Two values, matching the two output textboxes of the interface.
    return standard_summary, output2_display
|
234 |
+
|
235 |
+
|
236 |
+
# --- Create and Launch Gradio Interface ---
|
237 |
+
print("\nSetting up Gradio interface...")
|
238 |
+
# Add description specific to your setup
|
239 |
+
app_description = """
|
240 |
+
ஒரு தமிழ் பத்தியை உள்ளிடவும். இந்த பயன்பாடு இரண்டு சுருக்கங்களை உருவாக்கும்:
|
241 |
+
1. **நிலையான சுருக்கம்:** முன் பயிற்சி பெற்ற mT5 மாதிரியைப் பயன்படுத்தி உருவாக்கப்பட்டது.
|
242 |
+
2. **NER பகுப்பாய்வு & செல்வாக்கு பெற்ற சுருக்கம்:** உங்கள் தனிப்பயன் NER மாதிரியால் அடையாளம் காணப்பட்ட முக்கிய சொற்களைப் பட்டியலிடுகிறது, பின்னர் அந்த சொற்களை உள்ளீட்டின் முன்சேர்த்து உருவாக்கப்பட்ட சுருக்கத்தைக் காட்டுகிறது (இது சுருக்கத்தில் அவற்றைச் சேர்க்க மாதிரியை பாதிக்கலாம்).
|
243 |
+
|
244 |
+
Enter a Tamil paragraph. This app generates two summaries:
|
245 |
+
1. **Standard Summary:** Generated using the pre-trained mT5 model.
|
246 |
+
2. **NER Analysis & Influenced Summary:** Lists key entities identified by your custom NER model, then shows a summary generated by prepending those entities to the input (which may influence the model to include them).
|
247 |
+
"""
|
248 |
+
|
249 |
+
# Add examples if desired
|
250 |
+
example_list = [
|
251 |
+
["இந்திய கிரிக்கெட் அணியின் முன்னாள் கேப்டனும், சென்னை சூப்பர் கிங்ஸ் அணியின் தற்போதைய கேப்டனுமான எம்.எஸ். தோனி ஐபிஎல் தொடரில் இருந்து ஓய்வு பெறுவதாக வெளியான தகவல்கள் வெறும் வதந்தி என சிஎஸ்கே நிர்வாகம் மறுத்துள்ளது. நேற்று முன்தினம் மும்பை இந்தியன்ஸ் அணிக்கு எதிரான போட்டியில் சென்னை அணி அபார வெற்றி பெற்றது. இதில் தோனியின் கடைசி நேர அதிரடி ஆட்டம் முக்கிய பங்கு வகித்தது."],
|
252 |
+
["ஜெய்ப்பூர்: ஐபிஎல் 2025 ஆம் ஆண்டு சீசனில் ராஜஸ்தான் ராயல்ஸ் அணிக்காக 14 வயது சூரியவன்ஷி அறிமுகமானார். இதன் மூலம் இளம் வயதில் ஐபிஎல் தொடரில் களமிறங்கிய வீரர் என்ற சாதனையை வைபவ் படைத்திருக்கிறார்."]
|
253 |
+
]
|
254 |
+
|
255 |
+
|
256 |
+
# Build the Gradio UI: one multi-line input textbox and two output textboxes,
# matching the two strings returned by process_text_for_gradio.
iface = gr.Interface(
    fn=process_text_for_gradio, # The function to call
    inputs=gr.Textbox(lines=15, label=" உள்ளீடு தமிழ் பத்தி (Input Tamil Paragraph)"),
    outputs=[
        gr.Textbox(label=" வெளியீடு 1: நிலையான சுருக்கம் (Output 1: Standard Summary)"),
        gr.Textbox(label=" வெளியீடு 2: NER பகுப்பாய்வு & செல்வாக்கு பெற்ற சுருக்கம் (Output 2: NER Analysis & Influenced Summary)")
    ],
    title="தமிழ் சுருக்கம் மற்றும் NER ஒருங்கிணைப்பு (Tamil Summarization + NER Integration)",
    description=app_description,
    # NOTE(review): presumably hides the flag button; newer Gradio releases
    # may expect a different keyword for this - confirm against the pinned version.
    allow_flagging='never',
    examples=example_list
)
|
268 |
+
|
269 |
+
print("Launching Gradio interface... Access it at the URL provided.")
# NOTE: neither iface.queue() nor share=True is used here; the interface is
# launched with default settings plus show_error=True.
iface.launch(show_error=True)
|
mt5_summarize_ner_interactive_perc.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
import spacy
|
4 |
+
from pathlib import Path
|
5 |
+
import sys
|
6 |
+
# Make sure you have installed transformers, torch, sentencepiece, spacy, protobuf==3.20.3
|
7 |
+
try:
|
8 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
9 |
+
except ImportError:
|
10 |
+
print("✘ Error: 'transformers' library not found.")
|
11 |
+
print("Please install it: pip install transformers torch sentencepiece")
|
12 |
+
sys.exit(1)
|
13 |
+
import torch
|
14 |
+
import warnings
|
15 |
+
import re # For slightly better entity checking
|
16 |
+
import numpy as np # Needed for calculation
|
17 |
+
|
18 |
+
# --- Configuration ---
|
19 |
+
# 1. Path to your trained spaCy NER model (Use your best one!)
|
20 |
+
NER_MODEL_PATH = Path("./training_400/model-best") # <-- ADJUST TO YOUR BEST NER MODEL
|
21 |
+
|
22 |
+
# 2. Hugging Face model name for mT5 summarization
|
23 |
+
SUMMARIZATION_MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
|
24 |
+
|
25 |
+
# 3. Device: "cuda" for GPU or "cpu"
|
26 |
+
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
27 |
+
|
28 |
+
# 4. Summarization parameters
|
29 |
+
SUMM_NUM_BEAMS = 4
|
30 |
+
# --- NEW: Percentage-based length ---
|
31 |
+
MIN_LEN_PERC = 0.30 # Target minimum summary length as % of input tokens (e.g., 30%)
|
32 |
+
MAX_LEN_PERC = 0.75 # Target maximum summary length as % of input tokens (e.g., 75%)
|
33 |
+
# --- NEW: Absolute token limits (safety net) ---
|
34 |
+
ABS_MIN_TOKEN_LEN = 20 # Don't generate summaries shorter than this many tokens
|
35 |
+
ABS_MAX_TOKEN_LEN = 512 # Don't generate summaries longer than this many tokens
|
36 |
+
# --- End Configuration ---
|
37 |
+
|
38 |
+
warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
|
39 |
+
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
|
40 |
+
|
41 |
+
# --- Model Loading Functions ---
|
42 |
+
# (Keep load_ner_model and load_summarizer functions exactly as in the previous corrected version)
|
43 |
+
def load_ner_model(path):
    """Load and return the spaCy pipeline stored at *path*.

    If the pipeline has neither a ``sentencizer`` nor a ``parser``, a
    ``sentencizer`` is inserted: before ``tok2vec`` when present, otherwise
    before ``ner``, otherwise first. Failure to insert it is only logged.

    Args:
        path: ``pathlib.Path`` pointing at the model directory.

    Returns:
        The loaded spaCy pipeline.

    Raises:
        SystemExit: with code 1 when the directory is missing or loading fails.
    """
    if not path.exists():
        print(f"✘ Error: NER Model directory not found at {path.resolve()}")
        sys.exit(1)

    try:
        pipeline = spacy.load(path)
        print(f"✔ Successfully loaded NER model from: {path.resolve()}")

        # The first of these components that exists is the insertion anchor.
        anchor = next(
            (name for name in ("tok2vec", "ner") if name in pipeline.pipe_names),
            None,
        )
        can_split_sentences = pipeline.has_pipe("sentencizer") or pipeline.has_pipe("parser")
        if not can_split_sentences:
            try:
                if anchor:
                    pipeline.add_pipe("sentencizer", before=anchor)
                else:
                    pipeline.add_pipe("sentencizer", first=True)
                print("INFO: Added 'sentencizer' to loaded NER pipeline.")
            except Exception as pipe_error:
                print(f"✘ WARNING: Could not add 'sentencizer': {pipe_error}. Sentence splitting might fail.")
        return pipeline
    except Exception as load_error:
        print(f"✘ Error loading NER model from {path.resolve()}: {load_error}")
        sys.exit(1)
|
66 |
+
|
67 |
+
def load_summarizer(model_name):
    """Load the Hugging Face tokenizer and seq2seq model named *model_name*.

    The model is moved to ``DEVICE`` and its ``config.max_length`` is set to
    256 (a failure to set it is only logged).

    Args:
        model_name: Hub identifier (or local path) of the checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        SystemExit: with code 1 when loading fails.
    """
    try:
        print(f"\nLoading summarization tokenizer: {model_name}...")
        summ_tokenizer = AutoTokenizer.from_pretrained(model_name)

        print(f"Loading summarization model: {model_name} (this may take time)...")
        summ_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        summ_model.to(DEVICE)

        try:
            new_max = 256  # desired default generation cap
            summ_model.config.max_length = new_max
            print(f"INFO: Attempted to override model config max_length to {new_max}")
        except Exception as config_error:
            print(f"WARN: Could not override model config max_length: {config_error}")

        print(f"INFO: Model's configured max generation length: {summ_model.config.max_length}")
        print(f"✔ Successfully loaded summarization model '{model_name}' on {DEVICE}.")
        return summ_tokenizer, summ_model
    except Exception as load_error:
        print(f"✘ Error loading summarization model '{model_name}': {load_error}")
        print("Please ensure model name is correct, protobuf==3.20.3, internet access.")
        sys.exit(1)
|
89 |
+
|
90 |
+
# --- Core Logic Functions ---
|
91 |
+
|
92 |
+
# --- MODIFIED summarize_text function ---
|
93 |
+
def summarize_text(tokenizer, model, text, num_beams=SUMM_NUM_BEAMS,
                   min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
    """Generate an abstractive summary whose length scales with the input.

    The text is tokenized once, truncated to at most 1024 tokens and left
    unpadded. The minimum and maximum generation lengths are percentages of
    that model-input token count, clamped by ``ABS_MIN_TOKEN_LEN`` and
    ``ABS_MAX_TOKEN_LEN``.

    Args:
        tokenizer: Hugging Face tokenizer matching *model*.
        model: Seq2seq model exposing ``generate``.
        text: Text to summarize.
        num_beams: Beam width for beam search.
        min_length_perc: Minimum summary length as a fraction of input tokens.
        max_length_perc: Maximum summary length as a fraction of input tokens.

    Returns:
        The decoded summary, or a short explanatory string when the input is
        empty or generation fails (the traceback is printed, not raised).
    """
    if not text or text.isspace():
        return "Input text is empty."
    print("\nGenerating summary (using percentage lengths)...")
    max_input_tokens = 1024  # encoder input cap used by this script
    try:
        # 1. Tokenize once. No padding: a single sequence gains nothing from
        #    being padded to 1024 tokens and it makes the encoder pass far
        #    more expensive.
        inputs = tokenizer(text, max_length=max_input_tokens, return_tensors="pt",
                           padding=False, truncation=True)
        input_token_count = inputs["input_ids"].shape[1]
        if input_token_count == 0:
            return "Input text tokenized to zero tokens."
        print(f"INFO: Input text has approx {len(text.split())} words and {input_token_count} tokens.")
        if input_token_count >= max_input_tokens:
            print(f"INFO: Input reached the {max_input_tokens}-token limit and may have been truncated.")

        # 2. Target lengths are based on what the model actually sees, so a
        #    very long input no longer forces min == max == ABS_MAX_TOKEN_LEN.
        min_len_tokens = int(input_token_count * min_length_perc)
        max_len_tokens = int(input_token_count * max_length_perc)

        # 3. Apply absolute limits and keep min <= max.
        min_len_tokens = max(ABS_MIN_TOKEN_LEN, min_len_tokens)
        max_len_tokens = max(min_len_tokens + 10, max_len_tokens)
        max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
        min_len_tokens = min(min_len_tokens, max_len_tokens)
        print(f"INFO: Target summary token length: min={min_len_tokens}, max={max_len_tokens}.")

        # 4. Generate, passing the attention mask explicitly instead of
        #    relying on generate() to infer it from pad tokens.
        inputs = inputs.to(DEVICE)
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            num_beams=num_beams,
            max_length=max_len_tokens,
            min_length=min_len_tokens,
            early_stopping=True,
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True,
                                   clean_up_tokenization_spaces=True)
        print("✔ Summary generation complete.")
        return summary
    except Exception as e:
        # Best effort: report the failure and return a placeholder.
        print(f"✘ Error during summary generation: {e}")
        import traceback
        traceback.print_exc()
        return "[Error generating summary]"
|
140 |
+
|
141 |
+
# (Keep extract_entities function exactly as before)
|
142 |
+
def extract_entities(ner_nlp, text):
    """Return the unique named entities found in *text*.

    Args:
        ner_nlp: Callable (a spaCy pipeline) that maps text to an object with
            an ``ents`` iterable; each entity exposes ``text`` and ``label_``.
        text: Text to analyse.

    Returns:
        A list of ``(entity_text, label)`` tuples in order of first
        appearance, with surrounding whitespace stripped and blank entities
        dropped. An empty list is returned for empty input or when the
        pipeline raises (the error is logged).
    """
    if not text or text.isspace():
        return []
    print("\nExtracting entities from original text using custom NER model...")
    try:
        doc = ner_nlp(text)
        stripped_pairs = ((ent.text.strip(), ent.label_) for ent in doc.ents)
        # dict.fromkeys de-duplicates while keeping document order; a set
        # would make the order (and hence the prompt built from it) vary
        # from run to run.
        entities = list(dict.fromkeys(pair for pair in stripped_pairs if pair[0]))
        print(f"✔ Extracted {len(entities)} unique entities.")
        return entities
    except Exception as e:
        print(f"✘ Error during entity extraction: {e}")
        return []
|
154 |
+
|
155 |
+
# (Keep create_prompted_input function exactly as before)
|
156 |
+
def create_prompted_input(text, entities):
    """Prefix *text* with the entity texts found by NER.

    Entity texts are de-duplicated (the same text may occur under several
    labels) and kept in the order given, so each term appears once in the
    prefix.

    Args:
        text: Original input text.
        entities: Sequence of ``(entity_text, label)`` pairs; may be empty.

    Returns:
        *text* unchanged when there is nothing to prepend; otherwise the
        entity texts joined by ", ", the Tamil key-terms separator, and *text*.
    """
    # Order-preserving de-duplication of the non-empty entity texts.
    unique_texts = list(dict.fromkeys(ent[0] for ent in entities or () if ent[0]))
    if not unique_texts:
        print("INFO: No entities found by NER, using original text for prompted summary.")
        return text  # Return original text if no entities found
    entity_string = ", ".join(unique_texts)
    separator = ". முக்கிய சொற்கள்: "  # ". Key terms: "
    prompted_text = f"{entity_string}{separator}{text}"
    print(f"\nINFO: Created prompted input (showing start): {prompted_text[:250]}...")  # For debugging
    return prompted_text
|
166 |
+
|
167 |
+
# --- Main execution ---
|
168 |
+
# (Keep main function exactly as before - it now calls the modified summarize_text)
|
169 |
+
def main():
    """Run the interactive demo: read one paragraph, print two summaries.

    Loads the NER and summarization models, reads a single line of text from
    standard input, prints a standard summary, the entities found by NER, and
    a second summary generated from the entity-prefixed input.

    Raises:
        SystemExit: with code 1 when no input text is provided (or when model
            loading fails inside the loader functions).
    """
    # Load models
    print("Loading models, please wait...")
    ner_model = load_ner_model(NER_MODEL_PATH)
    summ_tokenizer, summ_model = load_summarizer(SUMMARIZATION_MODEL_NAME)
    print("\nModels loaded successfully!")
    print("="*50)

    # Get input text from the user
    print("Please paste the Tamil text paragraph you want to summarize below.")
    print("Press Enter after pasting the text.")
    print("(You might need to configure your terminal for multi-line paste if it's long)")
    print("-" * 50)
    try:
        input_paragraph = input("Input Text:\n")
    except EOFError:
        # Closed/empty stdin is treated like an empty answer.
        input_paragraph = ""

    if not input_paragraph or input_paragraph.isspace():
        print("\n✘ Error: No input text provided. Exiting.")
        sys.exit(1)
    text_to_process = input_paragraph.strip()

    print("\n" + "="*50)
    print("Processing Input Text (Snippet):")
    snippet = text_to_process[:300]
    # Only show an ellipsis when the text was actually cut.
    print((snippet + "...") if len(text_to_process) > 300 else snippet)
    print("="*50)

    # --- Output 1: standard summary (percentage-based length) ---
    print("\n--- Output 1: Standard Abstractive Summary (Percentage Length) ---")
    standard_summary = summarize_text(
        summ_tokenizer, summ_model, text_to_process,
        num_beams=SUMM_NUM_BEAMS,
        # Uses default percentages MIN_LEN_PERC, MAX_LEN_PERC from config section
    )
    print("\nStandard Summary:")
    print(standard_summary)
    print("-" * 50)

    # --- Output 2: NER-influenced summary (percentage-based length) ---
    print("\n--- Output 2: NER-Influenced Abstractive Summary (Percentage Length) ---")
    # a) Extract entities
    extracted_entities = extract_entities(ner_model, text_to_process)
    print("\nKey Entities Extracted by NER:")
    if extracted_entities:
        for text_ent, label in extracted_entities:
            print(f" - '{text_ent}' ({label})")
    else:
        print(" No entities found by NER model.")

    # b) Create prompted input
    prompted_input_text = create_prompted_input(text_to_process, extracted_entities)

    # c) Generate summary from prompted input
    ner_influenced_summary = summarize_text(
        summ_tokenizer, summ_model, prompted_input_text,
        num_beams=SUMM_NUM_BEAMS,
        # Uses default percentages MIN_LEN_PERC, MAX_LEN_PERC from config section
    )
    print("\nNER-Influenced Summary (Generated using entities as prefix):")
    print(ner_influenced_summary)
    print("\nNOTE: Compare this summary with the standard summary (Output 1).")
    print("See if prepending entities influenced the output and included more of them.")
    print("This method is experimental and doesn't guarantee inclusion.")
    print("="*50)


if __name__ == "__main__":
    main()
|
predict2.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy
|
2 |
+
from pathlib import Path
|
3 |
+
import sys
|
4 |
+
|
5 |
+
# --- Configuration ---
|
6 |
+
# Ensure this path points to your best trained model directory
|
7 |
+
# We are using the one trained on the CPU from the previous steps.
|
8 |
+
MODEL_PATH = Path("./training_400/model-best")
|
9 |
+
# --- End Configuration ---
|
10 |
+
|
11 |
+
def load_model(path):
    """Load and return the spaCy model stored at *path*.

    Args:
        path: ``pathlib.Path`` pointing at the trained model directory.

    Returns:
        The loaded spaCy pipeline.

    Raises:
        SystemExit: with code 1 when the directory is missing or loading fails.
    """
    if not path.exists():
        print(f"✘ Error: Model directory not found at {path.resolve()}")
        print("Please ensure the path is correct and you have trained the model.")
        sys.exit(1)

    try:
        # CuPy warnings may still show up here when the CUDA path is not set;
        # loading is expected to proceed on CPU for this model.
        pipeline = spacy.load(path)
        print(f"\n✔ Successfully loaded model from: {path.resolve()}")
    except Exception as load_error:
        print(f"✘ Error loading model from {path.resolve()}: {load_error}")
        print("Please ensure the model path is correct and the model files are intact (especially meta.json).")
        sys.exit(1)  # Nothing useful can be done without a model
    return pipeline
|
27 |
+
|
28 |
+
def predict_entities(nlp, text):
    """Run *nlp* on *text* and print every entity it finds.

    Args:
        nlp: Callable (a spaCy pipeline) returning an object with ``ents``;
            each entity exposes ``text``, ``label_``, ``start_char`` and
            ``end_char``.
        text: Text to analyse; empty or whitespace-only text is reported and
            skipped.

    Returns:
        ``None``; results are written to standard output.
    """
    if not text or text.isspace():
        print("Input text is empty.")
        return

    # Show at most the first 100 characters of long inputs in the banner.
    preview = text[:100] + "..." if len(text) > 100 else text
    display_text = '"' + preview + '"'
    print(f"\n---> Processing text: {display_text}")

    # Process the text with the loaded NLP model
    doc = nlp(text)

    if not doc.ents:
        print("\n--- No entities found in this text. ---")
    else:
        print("\n--- Entities Found ---")
        for entity in doc.ents:
            print(f" Text: '{entity.text}'")
            print(f" Label: {entity.label_}")
            print(f" Start: {entity.start_char}, End: {entity.end_char}")
            print("-" * 25)  # Separator between entities
    print("=" * 40)  # Separator between different predictions
|
52 |
+
|
53 |
+
def main():
    """Load the model and run the interactive prediction loop.

    Reads lines from standard input until the user enters ``quit``/``exit``
    (case-insensitive, surrounding whitespace ignored), an empty line, or
    sends EOF / Ctrl+C. Every other line is passed to ``predict_entities``.
    Unexpected errors during a prediction are reported and the loop continues.
    """
    nlp_model = load_model(MODEL_PATH)

    print("\n==============================")
    print(" Interactive NER Predictor")
    print("==============================")
    print(f"Model loaded: {MODEL_PATH.name}")
    print("Enter Tamil text below to identify entities.")
    print("Type 'quit' or 'exit' (or just press Enter on an empty line) to stop.")
    print("-" * 40)

    while True:
        try:
            user_input = input("Enter text >> ")

            # Tolerate stray whitespace around the exit commands so that
            # e.g. "quit " is not sent to the model as text.
            command = user_input.strip().lower()
            if user_input == "" or command in ("quit", "exit"):
                print("\nExiting predictor.")
                break

            predict_entities(nlp_model, user_input)

        except (EOFError, KeyboardInterrupt):
            # Ctrl+D / Ctrl+C end the session cleanly.
            print("\nExiting predictor.")
            break
        except Exception as e:
            # Keep the session alive after a failed prediction.
            print(f"\nAn unexpected error occurred: {e}")


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
Binary file (3.39 kB). View file
|
|