# -*- coding: utf-8 -*-
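"""Streamlit app for Tamil news summarization with NER integration.

Loads a spaCy NER pipeline and an mT5 summarizer (base model plus a PEFT
adapter), prepends extracted entities to the input as a soft prompt, and
renders the entities and the NER-influenced summary side by side.
"""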

# --- Prerequisites ---
# Ensure these are in your requirements.txt for Hugging Face Spaces:
# spacy==3.5.0 # Or the version used to train the NER model
# streamlit>=1.0.0
# transformers>=4.20.0
# torch>=1.10.0 # Or tensorflow
# sentencepiece>=0.1.90
# protobuf==3.20.3
# peft>=0.5.0 # Parameter-Efficient Fine-Tuning library
# accelerate>=0.26.0
# numpy
# nltk # Used for ROUGE post-processing during fine-tuning; only 'punkt' is needed at runtime
# bitsandbytes # If using 8-bit optimizer

import streamlit as st
import spacy
from pathlib import Path
import sys
import torch
import warnings
import re
import numpy as np

try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    from peft import PeftModel
    import nltk
    nltk.download('punkt', quiet=True) # Ensure punkt tokenizer is available for potential NLTK use
    print("✔ Successfully imported core libraries.")
except ImportError as e:
    # Display error in the Streamlit app itself if imports fail during runtime
    st.error(f"Error importing libraries: {e}. Please check requirements.txt and ensure all packages are installed.")
    st.stop() # Stop execution if libraries are missing


# --- Configuration ---
# Use paths relative to this app.py script
NER_MODEL_PATH = Path("./training_400") # Trained spaCy pipeline directory, assumed to sit at the repo root
BASE_SUMMARIZATION_MODEL = "csebuetnlp/mT5_multilingual_XLSum"
ADAPTER_PATH = Path("./mt5_finetuned_tamil_summary") # Path to your fine-tuned adapters

# Device Selection
DEVICE = "cpu" # Default to CPU for broader compatibility on free tiers
if torch.cuda.is_available():
    print("INFO: CUDA device detected. Setting DEVICE to 'cuda'.")
    DEVICE = "cuda"
else:
    print("INFO: No CUDA device detected. Using CPU.")

# Summarization parameters
SUMM_NUM_BEAMS = 4
MIN_LEN_PERC = 0.30
MAX_LEN_PERC = 0.70
ABS_MIN_TOKEN_LEN = 30
ABS_MAX_TOKEN_LEN = 512
# --- End Configuration ---
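
# Worked example of the length bounds above (illustrative): a 400-token input
# gives min = max(30, int(400 * 0.30)) = 120 and
# max = min(512, max(120 + 10, int(400 * 0.70))) = 280 target tokens;
# a 60-token input clamps to the floor: min = 30, max = 42.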

# --- Suppress Warnings ---
warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
warnings.filterwarnings("ignore", message=".*The sentencepiece tokenizer that you are converting.*")

# --- Global Variables & Model Loading Control ---
ner_model_global = None
summ_tokenizer_global = None
summ_model_global = None
models_loaded_status = "Not Loaded" # More descriptive status

# --- Model Loading with Streamlit Caching ---
@st.cache_resource # Cached once per server process and shared across sessions
def load_ner_model_cached(path):
    """Loads the spaCy NER model."""
    global models_loaded_status
    models_loaded_status = f"Loading NER model from: {path}..."
    st.info(models_loaded_status)
    if not path.exists():
        st.error(f"NER Model directory not found at {path.resolve()}")
        models_loaded_status = "Error: NER Model Not Found"
        return None
    try:
        nlp = spacy.load(path)
        # Add a sentencizer if the pipeline has no sentence-boundary component
        # (crucial for sentence splitting later)
        if not nlp.has_pipe("sentencizer") and not nlp.has_pipe("parser"):
            if "ner" in nlp.pipe_names:
                nlp.add_pipe("sentencizer", before="ner")
            elif "tok2vec" in nlp.pipe_names:
                nlp.add_pipe("sentencizer", before="tok2vec")
            else:
                nlp.add_pipe("sentencizer", first=True)
            print("INFO: Added 'sentencizer' to NER pipeline.")
        print(f"✔ NER model loaded from: {path}")
        return nlp
    except Exception as e:
        st.error(f"Error loading NER model: {e}")
        models_loaded_status = f"Error Loading NER Model: {e}"
        return None

@st.cache_resource # Cached once per server process and shared across sessions
def load_summarizer_cached(base_model_name, adapter_path, device):
    """Loads the Hugging Face base model and applies PEFT adapter."""
    global models_loaded_status
    models_loaded_status = f"Loading Summarizer (Base: {base_model_name}, Adapter: {adapter_path})..."
    st.info(models_loaded_status)
    try:
        print(f"┣ Loading base tokenizer: {base_model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)

        print(f"┣ Loading base model: {base_model_name}...")
        base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)

        print(f"┣ Loading PEFT adapter from: {adapter_path}...")
        if not adapter_path.exists():
            st.warning(f"PEFT adapter directory not found at {adapter_path.resolve()}. Falling back to the base model only.")
            model = base_model  # Fallback: summaries come from the un-tuned base model
        else:
            model = PeftModel.from_pretrained(base_model, adapter_path)
            print("✔ Successfully loaded PEFT adapter.")

        print(f"┣ Moving summarization model to {device}...")
        model.to(device)
        model.eval() # Set to evaluation mode
        print(f"✔ Summarization model loaded on {device}.")
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading summarization model: {e}")
        print(f"✘ FATAL: Error loading summarization model: {e}")
        import traceback
        traceback.print_exc()
        models_loaded_status = f"Error Loading Summarizer: {e}"
        return None, None
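
# Optional optimization (an assumption, untested here): if the adapter is
# LoRA-style, its weights can be folded into the base model after loading via
#     model = model.merge_and_unload()
# which removes the PEFT indirection and speeds up CPU inference, at the cost
# of no longer being able to swap adapters.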

# --- Helper Functions ---
def summarize_text_internal(tokenizer, model, text, device, num_beams=SUMM_NUM_BEAMS,
                            min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
    """Internal function to generate summary."""
    if not text or text.isspace(): return "[Error: Input text is empty]"
    # Ensure models are loaded before proceeding
    if not tokenizer or not model: return "[Error: Summarization model not ready]"
    print("INFO: Generating summary (percentage lengths)...")
    try:
        # Count input tokens to derive proportional summary-length bounds.
        # (Plain tokenizer call: as_target_tokenizer is deprecated and is
        # meant for preparing labels, not inputs.)
        input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
        input_token_count = input_ids.shape[1]
        if input_token_count == 0: return "[Error: Input tokenized to zero tokens]"
        min_len_tokens = max(ABS_MIN_TOKEN_LEN, int(input_token_count * min_length_perc))
        max_len_tokens = max(min_len_tokens + 10, int(input_token_count * max_length_perc))
        max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
        min_len_tokens = min(min_len_tokens, max_len_tokens)
        print(f"INFO: Target summary tokens: min={min_len_tokens}, max={max_len_tokens}")

        # Tokenize for the encoder (truncate long articles; padding is not
        # needed for a single sequence)
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True).to(device)

        # Generate with beam search; pass the attention mask explicitly so
        # generate() does not have to infer it from pad tokens
        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                num_beams=num_beams,
                max_length=max_len_tokens,
                min_length=min_len_tokens,
                early_stopping=True
            )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        print("✔ Summary generation complete.")
        return summary
    except Exception as e:
        st.error(f"Error during summary generation: {e}")
        print(f"✘ Error during summary generation: {e}")
        import traceback
        traceback.print_exc()
        return f"[Error generating summary: {e}]"

def extract_entities_internal(ner_nlp, text):
    """Extracts entities and formats them as a markdown string."""
    if not text or text.isspace(): return [], "- No input text -"
    if ner_nlp is None: return [], "[Error: NER model not loaded]"
    print("INFO: Extracting entities...")
    try:
        doc = ner_nlp(text)
        entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()})
        print(f"✔ Extracted {len(entities)} unique entities.")
        if entities:
            # Format as Markdown list
            entity_list_str = "\n".join([f"- **{lbl}:** {txt}" for txt, lbl in sorted(entities, key=lambda x: x[1])]) # Sort by label
        else:
            entity_list_str = "(No entities found by NER model)"
        return entities, entity_list_str
    except Exception as e:
        st.error(f"Error during entity extraction: {e}")
        print(f"✘ Error during entity extraction: {e}")
        return [], "[Error extracting entities]"

def create_prompted_input_internal(text, entities):
    """Creates input string with unique entities prepended."""
    if not entities: return text
    if not isinstance(text, str): return "[Invalid Input Text]"
    unique_entity_texts = sorted({ent[0] for ent in entities if ent[0]})
    entity_string = ", ".join(unique_entity_texts)
    separator = ". முக்கிய சொற்கள்: "
    prompted_text = f"{entity_string}{separator}{text}"
    print(f"INFO: Created prompted input with {len(unique_entity_texts)} unique entities.")
    return prompted_text
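
# Illustrative prompted input (entity texts assumed): for entities
# ["மோடி", "சென்னை"] and article text T, the model receives
# "சென்னை, மோடி. முக்கிய சொற்கள்: T", i.e. the sorted unique entity strings,
# then the Tamil separator ("key words:"), then the original text.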

# --- Streamlit App Layout ---
st.set_page_config(layout="wide", page_title="Tamil NER Summarizer", page_icon="✍️")

st.title("தமிழ் செய்தி சுருக்கம் மற்றும் NER ஒருங்கிணைப்பு")
st.markdown("*(Tamil News Summarization with NER Integration)*")
st.markdown("---")

# --- Load Models ---
# Trigger loading models using the cached functions
# Assign to global variables if loading is successful
ner_model_global = load_ner_model_cached(NER_MODEL_PATH)
summ_tokenizer_global, summ_model_global = load_summarizer_cached(BASE_SUMMARIZATION_MODEL, ADAPTER_PATH, DEVICE)

# Check if models loaded successfully before proceeding
models_ready = ner_model_global is not None and summ_tokenizer_global is not None and summ_model_global is not None
if not models_ready:
    st.error("One or more essential models failed to load. Please check the application logs (terminal/HF Spaces logs) for details. The app cannot function.")
    st.stop() # Stop the app if models aren't ready
else:
    st.sidebar.success(f"Models loaded successfully on {DEVICE.upper()}!")
    st.sidebar.markdown(f"**NER Model:** `{NER_MODEL_PATH.name}`")
    st.sidebar.markdown(f"**Summarizer:** `{BASE_SUMMARIZATION_MODEL}` + Adapter")


# --- Input Area ---
st.header(" உள்ளீடு / Input")
input_text = st.text_area("உங்கள் தமிழ் உரையை இங்கே ஒட்டவும் (Paste your Tamil text here):", height=300, key="input_text_area")

# --- Processing Trigger ---
if st.button("சுருக்கம் & NER ஐ உருவாக்குக (Generate Summary & NER)", key="generate_button"):
    if input_text and not input_text.isspace():
        text_to_process = input_text.strip()
        st.markdown("---")
        st.header(" முடிவுகள் / Results")

        # Use columns for the final output
        col1, col2 = st.columns(2)

        # --- Column 1: NER Entities ---
        with col1:
            st.subheader("முக்கிய சொற்கள் (NER Entities)")
            with st.spinner("Extracting entities..."):
                extracted_entities_raw, entities_display_string = extract_entities_internal(ner_model_global, text_to_process)
            # Display entities using markdown for copyability
            st.markdown(entities_display_string)

        # --- Column 2: NER-Influenced Summary ---
        with col2:
            st.subheader("NER-உடன் செல்வாக்கு பெற்ற சுருக்கம்")
            st.markdown("*(NER-Influenced Summary)*")
            with st.spinner(f"Generating summary on {DEVICE}... (This may take time)"):
                # Create prompted input using the extracted entities
                prompted_input_text = create_prompted_input_internal(text_to_process, extracted_entities_raw)
                # Generate the summary
                ner_influenced_summary = summarize_text_internal(
                    summ_tokenizer_global, summ_model_global, prompted_input_text, DEVICE
                )
            # Display summary using markdown for copyability
            st.markdown(ner_influenced_summary)
            st.caption("Summary generated using fine-tuned model with NER entities prepended to input.")

        st.success("Processing complete!")

    else:
        # Covers empty input too: "".isspace() is False, so an
        # `elif input_text.isspace()` branch would silently miss it
        st.warning("Please enter some text into the input area.")


st.markdown("---")
st.caption("Developed using Streamlit, spaCy, and Hugging Face Transformers/PEFT.")