# Streamlit app that summarizes Khmer text with the
# songhieng/khmer-mt5-summarization model from the Hugging Face Hub.
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Page configuration must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Khmer Text Summarization",
    page_icon="📝",
    layout="wide",
    initial_sidebar_state="expanded"
)

MODEL_ID = "songhieng/khmer-mt5-summarization"


@st.cache_resource
def load_tokenizer_and_model(model_id):
    """Load the tokenizer and seq2seq model once and cache them across reruns."""
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    return tokenizer, model


tokenizer, model = load_tokenizer_and_model(MODEL_ID)

st.title("📝 Khmer Text Summarization")
st.write("Paste your Khmer text below and click **Summarize** to get a concise summary.")

# Generation settings exposed in the sidebar.
st.sidebar.header("Summarization Settings")
max_length = st.sidebar.slider("Maximum summary length", 50, 300, 150, step=10)
min_length = st.sidebar.slider("Minimum summary length", 10, 100, 30, step=5)
num_beams = st.sidebar.slider("Beam search width", 1, 10, 4, step=1)

user_input = st.text_area(
    "Enter Khmer text here…",
    height=300,
    placeholder="បញ្ចូលអត្ថបទខ្មែរនៅទីនេះ…"
)

if st.button("Summarize"):
    if not user_input.strip():
        st.warning("⚠️ Please enter some text to summarize.")
    else:
        with st.spinner("Generating summary…"):
            # Tokenize the input, truncating anything longer than the model's limit.
            inputs = tokenizer(
                user_input,
                return_tensors="pt",
                truncation=True,
                padding="longest"
            )

            # Generate the summary with beam search, using the sidebar settings.
            summary_ids = model.generate(
                **inputs,
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                length_penalty=2.0,
                early_stopping=True
            )

            # Decode the generated token IDs back into plain text.
            summary = tokenizer.decode(
                summary_ids[0],
                skip_special_tokens=True
            )

        st.subheader("📄 Summary:")
        st.write(summary)
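
# To run this app locally (assuming the script is saved as app.py and that
# streamlit, transformers, and torch are installed):
#   streamlit run app.py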