wiraindrak committed
Commit ee90915 · 1 parent: d3f7143

Update app.py

Files changed (1)
  1. app.py (+64 −45)
app.py CHANGED
@@ -1,4 +1,8 @@
-from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, BertTokenizer, EncoderDecoderModel, AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, pipeline
+import nltk.data
+import pandas as pd              # used by sentiment_df
+import matplotlib.pyplot as plt  # used by run
+
 
 import gradio as gr
 from gradio.mix import Parallel
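Review note: `nltk.data.load('tokenizers/punkt/english.pickle')` in the next hunk raises a `LookupError` unless the Punkt data is already on disk. A minimal guard, assuming nothing about the Space's build step:

import nltk

# Fetch the Punkt sentence tokenizer on first run; nltk.data.load()
# fails with LookupError if the data is missing.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")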
@@ -6,13 +10,24 @@ from gradio.mix import Parallel
 tokenizer_t5 = T5Tokenizer.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
 model_t5 = T5ForConditionalGeneration.from_pretrained("panggi/t5-base-indonesian-summarization-cased")
 
-tokenizer_bert = BertTokenizer.from_pretrained("cahya/bert2bert-indonesian-summarization")
-tokenizer_bert.bos_token = tokenizer_bert.cls_token
-tokenizer_bert.eos_token = tokenizer_bert.sep_token
-model_bert = EncoderDecoderModel.from_pretrained("cahya/bert2bert-indonesian-summarization")
+pretrained_sentiment = "w11wo/indonesian-roberta-base-sentiment-classifier"
+pretrained_ner = "cahya/bert-base-indonesian-NER"
+
+sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
 
-t5_para_tokenizer = AutoTokenizer.from_pretrained("Wikidepia/IndoT5-base-paraphrase")
-t5_para_model = AutoModelForSeq2SeqLM.from_pretrained("Wikidepia/IndoT5-base-paraphrase")
+sentiment_pipeline = pipeline(
+    "sentiment-analysis",
+    model=pretrained_sentiment,
+    tokenizer=pretrained_sentiment,
+    return_all_scores=True
+)
+
+ner_pipeline = pipeline(
+    "ner",
+    model=pretrained_ner,
+    tokenizer=pretrained_ner,
+    grouped_entities=True
+)
 
 
 def summ_t5(text):
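For reference on the shapes the new functions rely on: with `return_all_scores=True` the sentiment pipeline returns one list of label/score dicts per input, and `grouped_entities=True` makes the NER pipeline merge word pieces into whole entities. A quick sanity check (label names are those published for the `w11wo` classifier; scores illustrative only):

preds = sentiment_pipeline("Saya sangat senang hari ini")
# preds -> [[{'label': 'positive', 'score': ...},
#            {'label': 'neutral',  'score': ...},
#            {'label': 'negative', 'score': ...}]]

ents = ner_pipeline("Joko Widodo berkunjung ke Surabaya")
# each item carries 'entity_group', 'word', 'score', 'start', 'end'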
@@ -28,45 +43,44 @@ def summ_t5(text):
     summary_text = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
     return summary_text
 
-def summ_bert(text):
-    input_ids = tokenizer_bert.encode(text, return_tensors="pt")
-    summary_ids = model_bert.generate(input_ids,
-                                      max_length=100,
-                                      num_beams=10,
-                                      repetition_penalty=2.5,
-                                      length_penalty=1.0,
-                                      early_stopping=True,
-                                      no_repeat_ngram_size=2,
-                                      use_cache=True)
-
-    summary_text = tokenizer_bert.decode(summary_ids[0], skip_special_tokens=True)
-    return summary_text
+def sentiment_analysis(text):
+    output = sentiment_pipeline(text)
+    return {elm["label"]: elm["score"] for elm in output[0]}
 
-def para_t5(text):
-    encoding = t5_para_tokenizer(text, padding='longest', return_tensors='pt')
-    outputs = t5_para_model.generate(
-        input_ids=encoding["input_ids"],
-        attention_mask=encoding["attention_mask"],
-        max_length=100,
-        do_sample=True,
-        top_k=120,
-        top_p=0.95,
-        early_stopping=True,
-        num_return_sequences=1)
-    return t5_para_tokenizer.decode(
-        outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
-    )
+def ner(text):
+    output = ner_pipeline(text)
+    for elm in output:
+        elm['entity'] = elm['entity_group']  # gr.HighlightedText expects an 'entity' key
+    return {"text": text, "entities": output}
 
-
-def summarize(text):
-    t5_ = summ_t5(text)
-    bert_ = summ_bert(text)
-    para_ = para_t5(t5_)
-    return t5_, bert_, para_
+def sentiment_df(text):
+    text_list = sentence_tokenizer.tokenize(text)  # per-sentence sentiment
+    result = [sentiment_analysis(sentence) for sentence in text_list]
+    labels = []
+    scores = []
+    for pred in result:
+        idx = list(pred.values()).index(max(list(pred.values())))
+        labels.append(list(pred.keys())[idx])
+        scores.append(round(list(pred.values())[idx], 3))
+    df = pd.DataFrame()
+    df['Text'] = text_list
+    df['Label'] = labels
+    df['Score'] = scores
+    return df
+
+def run(text):
+    summ_ = summ_t5(text)
+    sent_ = sentiment_analysis(summ_)
+    ner_ = ner(summ_)
+    df_ = sentiment_df(text)
+    ner_all = ner(text)
+    fig = plt.figure()
+    df_.groupby(["Label"])["Text"].count().plot.pie(autopct="%.1f%%", figsize=(6,6))
+    return summ_, sent_, ner_, fig, ner_all
 
 if __name__ == "__main__":
     with gr.Blocks() as demo:
-        gr.Markdown("""<h1 style="text-align:center">Summary of Summarizer - Indonesia</h1>""")
+        gr.Markdown("""<h1 style="text-align:center">News Analyzer - Indonesia</h1>""")
 
         gr.Markdown(
             """
@@ -77,9 +91,14 @@ if __name__ == "__main__":
         with gr.Column():
             input_text = gr.Textbox(label="Input Text")
             analyze_button = gr.Button(label="Analyze")
+
+            summ_output = gr.Textbox(label="Article Summary")
+            ner_output = gr.HighlightedText(label="NER Summary")
+            sent_output = gr.Textbox(label="Sentiment Summary")
+
         with gr.Column():
-            t5_output = gr.Textbox(label="T5 Base Output")
-            bert_output = gr.Textbox(label="Bert2Bert Base Output")
-            para_output = gr.Textbox(label="T5 Paraphrase Output")
-            analyze_button.click(summarize, inputs=input_text, outputs=[t5_output, bert_output, para_output])
+            plot_component = gr.Plot(label="Pie Chart of Sentiments")
+            ner_all_output = gr.HighlightedText(label="NER Article")
+
+        analyze_button.click(run, inputs=input_text, outputs=[summ_output, sent_output, ner_output, plot_component, ner_all_output])
     demo.launch()
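One wiring detail worth keeping in mind: `run` returns (summary, sentiment scores, NER of the summary, pie-chart figure, NER of the full article), and the `outputs` list in `analyze_button.click` must stay in that exact order. Calling it outside Gradio works the same way (`article` is any placeholder string):

article = "..."  # placeholder: any Indonesian news text
summ, sent, ner_summ, fig, ner_all = run(article)
fig.savefig("sentiment_pie.png")  # the same figure Gradio renders in gr.Plot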
 