songhieng committed on
Commit
9599706
·
verified ·
1 Parent(s): 8ef53b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -32
app.py CHANGED
@@ -1,49 +1,75 @@
1
  import streamlit as st
2
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
 
4
- # Load tokenizer and model
5
- model_identifier = "songhieng/khmer-mt5-summarization"
6
- tokenizer = AutoTokenizer.from_pretrained(model_identifier, use_fast=False)
7
- model = AutoModelForSeq2SeqLM.from_pretrained(model_identifier, use_fast=False)
8
 
9
- # Set page configuration
10
- st.set_page_config(page_title="Khmer Text Summarization", layout="wide")
 
 
 
 
11
 
12
- # App title and description
13
- st.title("Khmer Text Summarization")
14
- st.write("Enter Khmer text below to generate a concise summary.")
15
 
16
- # Text input
17
- user_input = st.text_area("Input Text:", height=300)
 
 
 
 
18
 
19
- # Summarization parameters
 
 
 
 
20
  st.sidebar.header("Summarization Settings")
21
- max_length = st.sidebar.slider("Maximum Summary Length", min_value=50, max_value=300, value=150, step=10)
22
- min_length = st.sidebar.slider("Minimum Summary Length", min_value=10, max_value=100, value=30, step=5)
23
- num_beams = st.sidebar.slider("Number of Beams", min_value=1, max_value=10, value=4, step=1)
 
 
 
 
 
 
24
 
25
- # Summarize button
26
- if st.button("Summarize"):
27
- if user_input.strip():
28
- try:
29
- # Tokenize input
30
- inputs = tokenizer.encode(user_input, return_tensors="pt", truncation=True)
31
 
32
- # Generate summary
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  summary_ids = model.generate(
34
- inputs,
35
  max_length=max_length,
36
  min_length=min_length,
37
  num_beams=num_beams,
38
  length_penalty=2.0,
39
  early_stopping=True
40
  )
41
- summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
42
-
43
- # Display summary
44
- st.subheader("Summary:")
45
- st.write(summary)
46
- except Exception as e:
47
- st.error(f"An error occurred during summarization: {e}")
48
- else:
49
- st.warning("Please enter some text to summarize.")
 
1
  import streamlit as st
2
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
 
4
# Hugging Face Hub repository that holds both the tokenizer and the model.
MODEL_ID = "songhieng/khmer-mt5-summarization"


@st.cache_resource
def load_tokenizer_and_model(model_id):
    """Load the tokenizer and the seq2seq model stored under *model_id*.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    can be reused instead of being rebuilt on every call.

    Args:
        model_id: Identifier passed unchanged to both ``from_pretrained``
            calls.

    Returns:
        A ``(tokenizer, model)`` tuple; the tokenizer is requested with
        ``use_fast=True``.
    """
    # Tuple elements are evaluated left to right, so the tokenizer is
    # still fetched before the model.
    return (
        AutoTokenizer.from_pretrained(model_id, use_fast=True),
        AutoModelForSeq2SeqLM.from_pretrained(model_id),
    )
13
 
14
# Page configuration runs before anything else that can touch the UI:
# Streamlit documents set_page_config as needing to be the first Streamlit
# command on a page, and the cached loader below may render a spinner
# while the model is being loaded.
st.set_page_config(
    page_title="Khmer Text Summarization",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Load (or fetch from Streamlit's resource cache) the tokenizer and model.
tokenizer, model = load_tokenizer_and_model(MODEL_ID)

# App header
st.title("📝 Khmer Text Summarization")
st.write(
    "Paste your Khmer text below and click **Summarize** to get a concise summary."
)

# Sidebar summarization settings
st.sidebar.header("Summarization Settings")
max_length = st.sidebar.slider(
    "Maximum summary length", 50, 300, 150, step=10
)
min_length = st.sidebar.slider(
    "Minimum summary length", 10, 100, 30, step=5
)
num_beams = st.sidebar.slider(
    "Beam search width", 1, 10, 4, step=1
)

# The two slider ranges overlap (minimum goes up to 100, maximum down to 50),
# so the user can request a minimum above the maximum. Clamp it so that
# generation always receives consistent length constraints.
if min_length > max_length:
    st.sidebar.warning(
        f"Minimum length ({min_length}) exceeds maximum length "
        f"({max_length}); using {max_length} as the minimum."
    )
    min_length = max_length

# Text input
user_input = st.text_area(
    "Enter Khmer text here…",
    height=300,
    # Khmer for roughly "Please type Khmer text here…".
    placeholder="សូមវាយអត្ថបទខ្មែរនៅទីនេះ…",
)

# Summarize button
if st.button("Summarize"):
    if not user_input.strip():
        # Whitespace-only input is treated the same as no input.
        st.warning("⚠️ Please enter some text to summarize.")
    else:
        with st.spinner("Generating summary…"):
            # Tokenize the raw text into model inputs (PyTorch tensors).
            inputs = tokenizer(
                user_input,
                return_tensors="pt",
                truncation=True,
                padding="longest",
            )
            # Generate the summary with beam search using the sidebar settings.
            summary_ids = model.generate(
                **inputs,
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                length_penalty=2.0,
                early_stopping=True,
            )
            # Decode the first (only) generated sequence back to text.
            summary = tokenizer.decode(
                summary_ids[0],
                skip_special_tokens=True,
            )
        # Display the result once generation has finished.
        st.subheader("🔖 Summary:")
        st.write(summary)