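"""Streamlit app for Tamil news summarization with NER integration.

Loads a custom spaCy NER model plus an mT5 summarizer
(csebuetnlp/mT5_multilingual_XLSum) with a fine-tuned PEFT adapter, extracts
entities from the input text, and prepends them to the summarization input so
the generated summary is nudged to retain them.
"""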
import streamlit as st
import spacy
from pathlib import Path
import sys
import torch
import warnings
import re
import numpy as np
|
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    from peft import PeftModel
    import nltk
    nltk.download('punkt', quiet=True)
    print("✔ Successfully imported core libraries.")
except ImportError as e:
    st.error(f"Error importing libraries: {e}. Please check requirements.txt and ensure all packages are installed.")
    st.stop()
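
# NOTE (assumed setup): the imports above imply a requirements.txt containing at
# least streamlit, spacy, torch, transformers, peft, nltk, and numpy. The app is
# launched with the standard Streamlit CLI, e.g.:
#   streamlit run app.py   # "app.py" is this file's assumed name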
|
# --- Model paths and configuration ---
NER_MODEL_PATH = Path("./training_400")
BASE_SUMMARIZATION_MODEL = "csebuetnlp/mT5_multilingual_XLSum"
ADAPTER_PATH = Path("./mt5_finetuned_tamil_summary")

# Select the compute device: prefer CUDA when available, fall back to CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    print("INFO: CUDA device detected. Setting DEVICE to 'cuda'.")
    DEVICE = "cuda"
else:
    print("INFO: No CUDA device detected. Using CPU.")
|
# --- Summary generation parameters ---
SUMM_NUM_BEAMS = 4       # beam search width
MIN_LEN_PERC = 0.30      # summary min length as a fraction of input tokens
MAX_LEN_PERC = 0.70      # summary max length as a fraction of input tokens
ABS_MIN_TOKEN_LEN = 30   # hard floor on summary length, in tokens
ABS_MAX_TOKEN_LEN = 512  # hard ceiling on summary length, in tokens
|
# Silence known, harmless warnings from torch / transformers / sentencepiece.
warnings.filterwarnings("ignore", message="CUDA path could not be detected.*")
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
warnings.filterwarnings("ignore", message=".*The sentencepiece tokenizer that you are converting.*")
|
# Module-level handles for the loaded models and a human-readable status string.
ner_model_global = None
summ_tokenizer_global = None
summ_model_global = None
models_loaded_status = "Not Loaded"
|

@st.cache_resource
def load_ner_model_cached(path):
    """Loads the spaCy NER model and ensures it can split sentences."""
    global models_loaded_status
    models_loaded_status = f"Loading NER model from: {path}..."
    st.info(models_loaded_status)
    if not path.exists():
        st.error(f"NER model directory not found at {path.resolve()}")
        models_loaded_status = "Error: NER Model Not Found"
        return None
    try:
        nlp = spacy.load(path)
        # The custom pipeline may lack a sentence-boundary component; add a
        # sentencizer so downstream sentence iteration works.
        if not nlp.has_pipe("sentencizer") and not nlp.has_pipe("parser"):
            if "ner" in nlp.pipe_names:
                nlp.add_pipe("sentencizer", before="ner")
            elif "tok2vec" in nlp.pipe_names:
                nlp.add_pipe("sentencizer", before="tok2vec")
            else:
                nlp.add_pipe("sentencizer", first=True)
            print("INFO: Added 'sentencizer' to NER pipeline.")
        print(f"✔ NER model loaded from: {path}")
        return nlp
    except Exception as e:
        st.error(f"Error loading NER model: {e}")
        models_loaded_status = f"Error Loading NER Model: {e}"
        return None
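
# NOTE: st.cache_resource memoizes the return value per process, so both model
# loaders in this file run once and subsequent Streamlit reruns reuse the
# already-loaded objects instead of reloading from disk.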
|

@st.cache_resource
def load_summarizer_cached(base_model_name, adapter_path, device):
    """Loads the Hugging Face base model and applies the PEFT adapter."""
    global models_loaded_status
    models_loaded_status = f"Loading Summarizer (Base: {base_model_name}, Adapter: {adapter_path})..."
    st.info(models_loaded_status)
    try:
        print(f"┣ Loading base tokenizer: {base_model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)

        print(f"┣ Loading base model: {base_model_name}...")
        base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)

        print(f"┣ Loading PEFT adapter from: {adapter_path}...")
        if not adapter_path.exists():
            st.warning(f"PEFT adapter directory not found at {adapter_path.resolve()}. Falling back to the BASE model only.")
            model = base_model
        else:
            model = PeftModel.from_pretrained(base_model, adapter_path)
            print("✔ Successfully loaded PEFT adapter.")

        print(f"┣ Moving summarization model to {device}...")
        model.to(device)
        model.eval()
        print(f"✔ Summarization model loaded on {device}.")
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading summarization model: {e}")
        print(f"✘ FATAL: Error loading summarization model: {e}")
        import traceback
        traceback.print_exc()
        models_loaded_status = f"Error Loading Summarizer: {e}"
        return None, None
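
# Optional follow-up (a sketch, assuming a LoRA-style adapter): PEFT can fold
# the adapter weights into the base model for slightly faster inference:
#   merged_model = model.merge_and_unload()  # only valid for mergeable adapter types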
|

def summarize_text_internal(tokenizer, model, text, device, num_beams=SUMM_NUM_BEAMS,
                            min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
    """Generates a summary whose length targets are a percentage of the input length."""
    if not text or text.isspace():
        return "[Error: Input text is empty]"
    if not tokenizer or not model:
        return "[Error: Summarization model not ready]"
    print("INFO: Generating summary (percentage lengths)...")
    try:
        # Count the input tokens (without truncation) to derive length targets.
        input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
        input_token_count = input_ids.shape[1]
        if input_token_count == 0:
            return "[Error: Input tokenized to zero tokens]"
        min_len_tokens = max(ABS_MIN_TOKEN_LEN, int(input_token_count * min_length_perc))
        max_len_tokens = max(min_len_tokens + 10, int(input_token_count * max_length_perc))
        max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
        min_len_tokens = min(min_len_tokens, max_len_tokens)
        print(f"INFO: Target summary tokens: min={min_len_tokens}, max={max_len_tokens}")

        # Tokenize again for generation. Note the model only sees the first
        # 1024 tokens, even though the length targets above use the full count.
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True).to(device)

        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                num_beams=num_beams,
                max_length=max_len_tokens,
                min_length=min_len_tokens,
                early_stopping=True
            )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        print("✔ Summary generation complete.")
        return summary
    except Exception as e:
        st.error(f"Error during summary generation: {e}")
        print(f"✘ Error during summary generation: {e}")
        import traceback
        traceback.print_exc()
        return f"[Error generating summary: {e}]"
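
# Worked example of the length targets above, for a hypothetical 200-token
# input with the default percentages (0.30 / 0.70):
#   min_len_tokens = max(30, int(200 * 0.30))                 -> 60
#   max_len_tokens = min(512, max(60 + 10, int(200 * 0.70)))  -> 140
# so generation is constrained to a summary of 60-140 tokens.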
|

def extract_entities_internal(ner_nlp, text):
    """Extracts unique entities and formats them as a Markdown list."""
    if not text or text.isspace():
        return [], "- No input text -"
    if ner_nlp is None:
        return [], "[Error: NER model not loaded]"
    print("INFO: Extracting entities...")
    try:
        doc = ner_nlp(text)
        # Deduplicate (text, label) pairs, dropping empty strings.
        entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()})
        print(f"✔ Extracted {len(entities)} unique entities.")
        if entities:
            # Sort by label so entities of the same type are grouped together.
            entity_list_str = "\n".join([f"- **{lbl}:** {txt}" for txt, lbl in sorted(entities, key=lambda x: x[1])])
        else:
            entity_list_str = "(No entities found by NER model)"
        return entities, entity_list_str
    except Exception as e:
        st.error(f"Error during entity extraction: {e}")
        print(f"✘ Error during entity extraction: {e}")
        return [], "[Error extracting entities]"
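
# Illustration (hypothetical entities): the Markdown block built above renders as
#   - **LOC:** சென்னை
#   - **PER:** கமலா
# with entries grouped by label.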
|

def create_prompted_input_internal(text, entities):
    """Prepends the unique entity strings to the input text as a keyword prompt."""
    if not entities:
        return text
    if not isinstance(text, str):
        return "[Invalid Input Text]"
    unique_entity_texts = sorted({ent[0] for ent in entities if ent[0]})
    entity_string = ", ".join(unique_entity_texts)
    separator = ". முக்கிய சொற்கள்: "  # Tamil for "key words:"
    prompted_text = f"{entity_string}{separator}{text}"
    print(f"INFO: Created prompted input with {len(unique_entity_texts)} unique entities.")
    return prompted_text
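
# Illustration (hypothetical values): entities [("சென்னை", "LOC")] and text
# "இன்று மழை பெய்யும்." produce:
#   "சென்னை. முக்கிய சொற்கள்: இன்று மழை பெய்யும்."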
|

# --- Streamlit UI ---
st.set_page_config(layout="wide", page_title="Tamil NER Summarizer", page_icon="✍️")

st.title("தமிழ் செய்தி சுருக்கம் மற்றும் NER ஒருங்கிணைப்பு")
st.markdown("*(Tamil News Summarization with NER Integration)*")
st.markdown("---")
|
# Load models once at startup (cached across reruns).
ner_model_global = load_ner_model_cached(NER_MODEL_PATH)
summ_tokenizer_global, summ_model_global = load_summarizer_cached(BASE_SUMMARIZATION_MODEL, ADAPTER_PATH, DEVICE)

models_ready = ner_model_global is not None and summ_tokenizer_global is not None and summ_model_global is not None
if not models_ready:
    st.error("One or more essential models failed to load. Please check the application logs (terminal/HF Spaces logs) for details. The app cannot function.")
    st.stop()
else:
    st.sidebar.success(f"Models loaded successfully on {DEVICE.upper()}!")
    st.sidebar.markdown(f"**NER Model:** `{NER_MODEL_PATH.name}`")
    st.sidebar.markdown(f"**Summarizer:** `{BASE_SUMMARIZATION_MODEL}` + Adapter")
|
st.header("உள்ளீடு / Input")
input_text = st.text_area("உங்கள் தமிழ் உரையை இங்கே ஒட்டவும் (Paste your Tamil text here):", height=300, key="input_text_area")
|
if st.button("சுருக்கம் & NER ஐ உருவாக்குக (Generate Summary & NER)", key="generate_button"):
    if input_text and not input_text.isspace():
        text_to_process = input_text.strip()
        st.markdown("---")
        st.header("முடிவுகள் / Results")

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("முக்கிய சொற்கள் (NER Entities)")
            with st.spinner("Extracting entities..."):
                extracted_entities_raw, entities_display_string = extract_entities_internal(ner_model_global, text_to_process)
            st.markdown(entities_display_string)

        with col2:
            st.subheader("NER-உடன் செல்வாக்கு பெற்ற சுருக்கம்")
            st.markdown("*(NER-Influenced Summary)*")
            with st.spinner(f"Generating summary on {DEVICE}... (This may take time)"):
                # Prepend the extracted entities so the summarizer is nudged to keep them.
                prompted_input_text = create_prompted_input_internal(text_to_process, extracted_entities_raw)
                ner_influenced_summary = summarize_text_internal(
                    summ_tokenizer_global, summ_model_global, prompted_input_text, DEVICE
                )
            st.markdown(ner_influenced_summary)
            st.caption("Summary generated using the fine-tuned model with NER entities prepended to the input.")

        st.success("Processing complete!")
    else:
        # st.text_area returns an empty string (never None), so a plain else
        # catches both empty and whitespace-only input.
        st.warning("Please enter some text into the input area.")
|
st.markdown("---")
st.caption("Developed using Streamlit, spaCy, and Hugging Face Transformers/PEFT.")