# -*- coding: utf-8 -*-
# --- Prerequisites ---
# Ensure these are in your requirements.txt for Hugging Face Spaces:
# spacy==3.5.0 # Or match the version used to train the NER model
# streamlit>=1.0.0
# transformers>=4.20.0
# torch>=1.10.0 # Or tensorflow
# sentencepiece>=0.1.90
# protobuf==3.20.3
# peft>=0.5.0 # Parameter-Efficient Fine-Tuning library
# accelerate>=0.26.0
# numpy
# nltk # For ROUGE metric calculation during fine-tuning (needed for postprocess_text if kept)
# bitsandbytes # If using 8-bit optimizer
import streamlit as st
import spacy
from pathlib import Path
import sys
import torch
import warnings
import re
import numpy as np
try:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import nltk
nltk.download('punkt', quiet=True) # Ensure punkt tokenizer is available for potential NLTK use
print("✔ Successfully imported core libraries.")
except ImportError as e:
# Display error in the Streamlit app itself if imports fail during runtime
st.error(f"Error importing libraries: {e}. Please check requirements.txt and ensure all packages are installed.")
st.stop() # Stop execution if libraries are missing
# --- Configuration ---
# Use paths relative to this app.py script
NER_MODEL_PATH = Path("./training_400") # Assumes model-best folder is at the repo root
BASE_SUMMARIZATION_MODEL = "csebuetnlp/mT5_multilingual_XLSum"
ADAPTER_PATH = Path("./mt5_finetuned_tamil_summary") # Path to your fine-tuned adapters
# Device Selection
DEVICE = "cpu" # Default to CPU for broader compatibility on free tiers
if torch.cuda.is_available():
print("INFO: CUDA device detected. Setting DEVICE to 'cuda'.")
DEVICE = "cuda"
else:
print("INFO: No CUDA device detected. Using CPU.")
# Summarization parameters
SUMM_NUM_BEAMS = 4
MIN_LEN_PERC = 0.30
MAX_LEN_PERC = 0.70
ABS_MIN_TOKEN_LEN = 30
ABS_MAX_TOKEN_LEN = 512
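# Worked example of the length heuristic applied in summarize_text_internal:
# for a 400-token input, min_len = max(30, int(400 * 0.30)) = 120 and
# max_len = min(512, max(120 + 10, int(400 * 0.70))) = 280 target tokens.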
# --- End Configuration ---
# --- Suppress Warnings ---
warnings.filterwarnings("ignore", message="CUDA path could not be detected*")
warnings.filterwarnings("ignore", message=".*You are using `torch.load` with `weights_only=False`.*")
warnings.filterwarnings("ignore", message=".*The sentencepiece tokenizer that you are converting.*")
# --- Global Variables & Model Loading Control ---
ner_model_global = None
summ_tokenizer_global = None
summ_model_global = None
models_loaded_status = "Not Loaded" # More descriptive status
# --- Model Loading with Streamlit Caching ---
@st.cache_resource # Cached once per process and shared across all sessions
def load_ner_model_cached(path):
"""Loads the spaCy NER model."""
global models_loaded_status
models_loaded_status = f"Loading NER model from: {path}..."
st.info(models_loaded_status)
if not path.exists():
st.error(f"NER Model directory not found at {path.resolve()}")
models_loaded_status = "Error: NER Model Not Found"
return None
try:
nlp = spacy.load(path)
# Add sentencizer if needed (crucial for sentence splitting later)
if not nlp.has_pipe("sentencizer") and not nlp.has_pipe("parser"):
            component_to_add_before = (
                "ner" if "ner" in nlp.pipe_names
                else "tok2vec" if "tok2vec" in nlp.pipe_names
                else None
            )
            if component_to_add_before:
                nlp.add_pipe("sentencizer", before=component_to_add_before)
            else:
                nlp.add_pipe("sentencizer", first=True)
            print("INFO: Added 'sentencizer' to NER pipeline.")
print(f"✔ NER model loaded from: {path}")
return nlp
except Exception as e:
st.error(f"Error loading NER model: {e}")
models_loaded_status = f"Error Loading NER Model: {e}"
return None
@st.cache_resource # Cached once per process and shared across all sessions
def load_summarizer_cached(base_model_name, adapter_path, device):
"""Loads the Hugging Face base model and applies PEFT adapter."""
global models_loaded_status
models_loaded_status = f"Loading Summarizer (Base: {base_model_name}, Adapter: {adapter_path})..."
st.info(models_loaded_status)
try:
print(f"┣ Loading base tokenizer: {base_model_name}...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
print(f"┣ Loading base model: {base_model_name}...")
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
print(f"┣ Loading PEFT adapter from: {adapter_path}...")
if not adapter_path.exists():
st.error(f"✘ FATAL: PEFT Adapter directory not found at {adapter_path.resolve()}. Using BASE model only.")
model = base_model # Fallback to base model
else:
model = PeftModel.from_pretrained(base_model, adapter_path)
print(f"✔ Successfully loaded PEFT adapter.")
print(f"┣ Moving summarization model to {device}...")
model.to(device)
model.eval() # Set to evaluation mode
print(f"✔ Summarization model loaded on {device}.")
return tokenizer, model
except Exception as e:
st.error(f"Error loading summarization model: {e}")
print(f"✘ FATAL: Error loading summarization model: {e}")
import traceback
traceback.print_exc()
models_loaded_status = f"Error Loading Summarizer: {e}"
return None, None
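# NOTE: requirements.txt lists bitsandbytes. If GPU memory is tight, one
# possible (untested here) variant is to load the base model 8-bit quantized:
#   base_model = AutoModelForSeq2SeqLM.from_pretrained(
#       base_model_name, load_in_8bit=True, device_map="auto")
# With device_map="auto", skip the manual model.to(device) call, since
# accelerate handles device placement.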
# --- Helper Functions ---
def summarize_text_internal(tokenizer, model, text, device, num_beams=SUMM_NUM_BEAMS,
min_length_perc=MIN_LEN_PERC, max_length_perc=MAX_LEN_PERC):
"""Internal function to generate summary."""
if not text or text.isspace(): return "[Error: Input text is empty]"
# Ensure models are loaded before proceeding
if not tokenizer or not model: return "[Error: Summarization model not ready]"
print("INFO: Generating summary (percentage lengths)...")
try:
        # Count input tokens with the source tokenizer to derive target lengths
        input_ids = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids
        input_token_count = input_ids.shape[1]
        if input_token_count == 0:
            return "[Error: Input tokenized to zero tokens]"
min_len_tokens = max(ABS_MIN_TOKEN_LEN, int(input_token_count * min_length_perc))
max_len_tokens = max(min_len_tokens + 10, int(input_token_count * max_length_perc))
max_len_tokens = min(ABS_MAX_TOKEN_LEN, max_len_tokens)
min_len_tokens = min(min_len_tokens, max_len_tokens)
print(f"INFO: Target summary tokens: min={min_len_tokens}, max={max_len_tokens}")
        # Tokenize the input (a single example, so no padding is needed)
        inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True).to(device)
        # Generate, passing the attention mask so any padding would be ignored
        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                num_beams=num_beams,
                max_length=max_len_tokens,
                min_length=min_len_tokens,
                early_stopping=True
            )
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print("✔ Summary generation complete.")
return summary
except Exception as e:
st.error(f"Error during summary generation: {e}")
print(f"✘ Error during summary generation: {e}")
import traceback
traceback.print_exc()
return f"[Error generating summary: {e}]"
def extract_entities_internal(ner_nlp, text):
"""Extracts entities and formats them as a markdown string."""
if not text or text.isspace(): return [], "- No input text -"
if ner_nlp is None: return [], "[Error: NER model not loaded]"
print("INFO: Extracting entities...")
try:
doc = ner_nlp(text)
entities = list({(ent.text.strip(), ent.label_) for ent in doc.ents if ent.text.strip()})
print(f"✔ Extracted {len(entities)} unique entities.")
if entities:
# Format as Markdown list
entity_list_str = "\n".join([f"- **{lbl}:** {txt}" for txt, lbl in sorted(entities, key=lambda x: x[1])]) # Sort by label
else:
entity_list_str = "(No entities found by NER model)"
return entities, entity_list_str
except Exception as e:
st.error(f"Error during entity extraction: {e}")
print(f"✘ Error during entity extraction: {e}")
return [], "[Error extracting entities]"
def create_prompted_input_internal(text, entities):
"""Creates input string with unique entities prepended."""
if not entities: return text
if not isinstance(text, str): return "[Invalid Input Text]"
unique_entity_texts = sorted(list({ent[0] for ent in entities if ent[0]}))
entity_string = ", ".join(unique_entity_texts)
separator = ". முக்கிய சொற்கள்: "
prompted_text = f"{entity_string}{separator}{text}"
print(f"INFO: Created prompted input with {len(unique_entity_texts)} unique entities.")
return prompted_text
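# Illustrative example of the prompt format (hypothetical entity values):
#   create_prompted_input_internal("...article text...", [("சென்னை", "LOC")])
#   -> "சென்னை. முக்கிய சொற்கள்: ...article text..."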
# --- Streamlit App Layout ---
st.set_page_config(layout="wide", page_title="Tamil NER Summarizer", page_icon="✍️")
st.title("தமிழ் செய்தி சுருக்கம் மற்றும் NER ஒருங்கிணைப்பு")
st.markdown("*(Tamil News Summarization with NER Integration)*")
st.markdown("---")
# --- Load Models ---
# Trigger loading models using the cached functions
# Assign to global variables if loading is successful
ner_model_global = load_ner_model_cached(NER_MODEL_PATH)
summ_tokenizer_global, summ_model_global = load_summarizer_cached(BASE_SUMMARIZATION_MODEL, ADAPTER_PATH, DEVICE)
# Check if models loaded successfully before proceeding
models_ready = ner_model_global is not None and summ_tokenizer_global is not None and summ_model_global is not None
if not models_ready:
st.error("One or more essential models failed to load. Please check the application logs (terminal/HF Spaces logs) for details. The app cannot function.")
st.stop() # Stop the app if models aren't ready
else:
st.sidebar.success(f"Models loaded successfully on {DEVICE.upper()}!")
st.sidebar.markdown(f"**NER Model:** `{NER_MODEL_PATH.name}`")
st.sidebar.markdown(f"**Summarizer:** `{BASE_SUMMARIZATION_MODEL}` + Adapter")
# --- Input Area ---
st.header(" உள்ளீடு / Input")
input_text = st.text_area("உங்கள் தமிழ் உரையை இங்கே ஒட்டவும் (Paste your Tamil text here):", height=300, key="input_text_area")
# --- Processing Trigger ---
if st.button("சுருக்கம் & NER ஐ உருவாக்குக (Generate Summary & NER)", key="generate_button"):
if input_text and not input_text.isspace():
text_to_process = input_text.strip()
st.markdown("---")
st.header(" முடிவுகள் / Results")
# Use columns for the final output
col1, col2 = st.columns(2)
# --- Column 1: NER Entities ---
with col1:
st.subheader("முக்கிய சொற்கள் (NER Entities)")
with st.spinner("Extracting entities..."):
extracted_entities_raw, entities_display_string = extract_entities_internal(ner_model_global, text_to_process)
# Display entities using markdown for copyability
st.markdown(entities_display_string)
# --- Column 2: NER-Influenced Summary ---
with col2:
st.subheader("NER-உடன் செல்வாக்கு பெற்ற சுருக்கம்")
st.markdown("*(NER-Influenced Summary)*")
with st.spinner(f"Generating summary on {DEVICE}... (This may take time)"):
# Create prompted input using the extracted entities
prompted_input_text = create_prompted_input_internal(text_to_process, extracted_entities_raw)
# Generate the summary
ner_influenced_summary = summarize_text_internal(
summ_tokenizer_global, summ_model_global, prompted_input_text, DEVICE
)
# Display summary using markdown for copyability
st.markdown(ner_influenced_summary)
st.caption("Summary generated using fine-tuned model with NER entities prepended to input.")
st.success("Processing complete!")
    else:
        # Covers empty, whitespace-only, and missing input alike
        st.warning("Please enter some text into the input area.")
# Handle the case where button hasn't been pressed yet explicitly
# else:
# st.info("Click the button to generate summaries and extract entities.")
st.markdown("---")
st.caption("Developed using Streamlit, spaCy, and Hugging Face Transformers/PEFT.") |