DeepLearning101 committed on
Commit
b9c5636
·
verified ·
1 Parent(s): 00f629f

Update app.py

Browse files

File "/home/user/app/app.py", line 57
return corrected_text + ' ' + str(details)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
SyntaxError: 'return' outside function

在except區塊中定義了幾個函數,但在except區塊的末尾,您直接使用了return語句,而這個return語句不屬於任何函數,這就是導致語法錯誤的原因。

移動函數定義:將ai_text、to_highlight和get_errors函數移出except區塊,使其成為全域函數。
例外處理:在except區塊中加入適當的異常處理邏輯,例如列印錯誤訊息。
介面定義:確認Gradio 介面的建立和配置正確無誤。

Files changed (1) hide show
  1. app.py +32 -54
app.py CHANGED
@@ -1,5 +1,3 @@
1
- # -*- coding: utf-8 -*-
2
-
3
  import gradio as gr
4
  import operator
5
  import torch
@@ -7,58 +5,41 @@ import os
7
  from transformers import BertTokenizer, BertForMaskedLM
8
 
9
  # 使用私有模型和分詞器
10
- model_name_or_path = "DeepLearning101/Corrector101zhTW"
11
- # auth_token = os.getenv("Corrector101zhTW") # 從環境變量中獲取 token
12
-
13
- # tokenizer = BertTokenizer.from_pretrained(model_name_or_path, use_auth_token=auth_token)
14
- # model = BertForMaskedLM.from_pretrained(model_name_or_path, use_auth_token=auth_token)
15
- # tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
16
- # model = BertForMaskedLM.from_pretrained(model_name_or_path)
17
-
18
  model_name_or_path = "DeepLearning101/Corrector101zhTW"
19
 
 
20
  try:
21
  tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
22
  model = BertForMaskedLM.from_pretrained(model_name_or_path)
23
  except Exception as e:
24
-
25
-
26
- def ai_text(text):
27
- with torch.no_grad():
28
- outputs = model(**tokenizer([text], padding=True, return_tensors='pt'))
29
 
30
- def to_highlight(corrected_sent, errs):
31
- output = [{"entity": "糾錯", "word": err[1], "start": err[2], "end": err[3]} for i, err in
32
- enumerate(errs)]
33
- return {"text": corrected_sent, "entities": output}
34
-
35
- def get_errors(corrected_text, origin_text):
36
- sub_details = []
37
- for i, ori_char in enumerate(origin_text):
38
- if ori_char in [' ', '“', '”', '‘', '’', '琊', '\n', '…', '—', '擤']:
39
- # add unk word
40
- corrected_text = corrected_text[:i] + ori_char + corrected_text[i:]
41
- continue
42
- if i >= len(corrected_text):
43
- continue
44
- if ori_char != corrected_text[i]:
45
- if ori_char.lower() == corrected_text[i]:
46
- # pass english upper char
47
- corrected_text = corrected_text[:i] + ori_char + corrected_text[i + 1:]
48
- continue
49
- sub_details.append((ori_char, corrected_text[i], i, i + 1))
50
- sub_details = sorted(sub_details, key=operator.itemgetter(2))
51
- return corrected_text, sub_details
52
-
53
- _text = tokenizer.decode(torch.argmax(outputs.logits[0], dim=-1), skip_special_tokens=True).replace(' ', '')
54
- corrected_text = _text[:len(text)]
55
- corrected_text, details = get_errors(corrected_text, text)
56
- print(text, ' => ', corrected_text, details)
57
  return corrected_text + ' ' + str(details)
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  if __name__ == '__main__':
61
-
62
  examples = [
63
  ['你究輸入利的手機門號跟生分證就可以了。'],
64
  ['這裡是客服中新,很高性為您服物,請問金天有什麼須要幫忙'],
@@ -66,16 +47,13 @@ if __name__ == '__main__':
66
  ['我來看以下,他的時價是多少?起實您就可以直皆就不用到門事'],
67
  ['因為你現在月富是六九九嘛,我幫擬減衣百塊,兒且也不會江速'],
68
  ]
69
-
70
- inputs=[gr.Textbox(lines=2, label="欲校正的文字")],
71
- outputs=[gr.Textbox(lines=2, label="修正後的文字")],
72
  gr.Interface(
73
- inputs='text',
74
- outputs='text',
75
- title="客服ASR文本AI糾錯系統",
76
- description="""
77
- <a href="https://www.twman.org" target='_blank'>TonTon Huang Ph.D. @ 2024/04 </a><br>
78
- 輸入ASR文本,糾正同音字/詞錯誤<br>
79
- Masked Language Model (MLM) as correction BERT
80
- """, examples=examples
81
  ).launch()
 
 
 
1
  import gradio as gr
2
  import operator
3
  import torch
 
5
  from transformers import BertTokenizer, BertForMaskedLM
6
 
# Use the private model and tokenizer.
model_name_or_path = "DeepLearning101/Corrector101zhTW"

# Try to load the model and tokenizer. Nothing below can work without them,
# so a failure is reported and the process stops with exit status 1.
try:
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path)
    model = BertForMaskedLM.from_pretrained(model_name_or_path)
except Exception as e:
    print(f"加載模型或分詞器失敗,錯誤信息:{e}")
    # The bare `exit()` helper is injected by the `site` module for the
    # interactive shell and is not guaranteed to exist (e.g. `python -S`);
    # raising SystemExit is the always-available equivalent.
    raise SystemExit(1) from e
 
 
 
17
 
# Characters that are skipped when comparing input and prediction.
# NOTE(review): presumably these are dropped or mapped to an unknown token by
# the tokenizer, so they never show up in the decoded text — confirm against
# the model vocabulary.
_SKIPPED_CHARS = frozenset([' ', '“', '”', '‘', '’', '琊', '\n', '…', '—', '擤'])


def _predict_text(text):
    """Run the masked-LM on ``text`` and return the decoded prediction.

    Uses the module-level ``tokenizer`` and ``model``. Spaces are stripped
    from the decoded string, matching what the comparison in
    ``get_errors`` expects.
    """
    with torch.no_grad():
        outputs = model(**tokenizer([text], padding=True, return_tensors='pt'))
    predicted_ids = torch.argmax(outputs.logits[0], dim=-1)
    decoded = tokenizer.decode(predicted_ids, skip_special_tokens=True)
    return decoded.replace(' ', '')


def ai_text(text):
    """Correct ``text`` and return the result as a single display string.

    Args:
        text: The raw input sentence.

    Returns:
        The corrected sentence, one space, then ``str()`` of the list of
        ``(original_char, corrected_char, start, end)`` tuples.
    """
    # The prediction is computed here and handed to get_errors explicitly;
    # previously get_errors read an `outputs` variable that only existed as
    # a local of this function, which raised NameError on every call.
    corrected_text, details = get_errors(text, _predict_text(text))
    return corrected_text + ' ' + str(details)


def to_highlight(corrected_sent, errs):
    """Convert an error list into a text-plus-entities dictionary.

    Args:
        corrected_sent: The corrected sentence.
        errs: Iterable of ``(original_char, corrected_char, start, end)``
            tuples as produced by ``get_errors``.

    Returns:
        ``{"text": corrected_sent, "entities": [...]}`` where each entity
        carries the corrected character and its ``start``/``end`` offsets.
    """
    output = [
        {"entity": "糾錯", "word": err[1], "start": err[2], "end": err[3]}
        for err in errs
    ]
    return {"text": corrected_sent, "entities": output}


def get_errors(text, corrected_text=None):
    """Align a model prediction with ``text`` and list the differing chars.

    Args:
        text: The original input sentence.
        corrected_text: The decoded model prediction for ``text``. When
            omitted, the prediction is computed with the module-level model.

    Returns:
        A ``(corrected_text, sub_details)`` tuple. ``corrected_text`` is the
        prediction re-aligned to ``text``; ``sub_details`` is a list of
        ``(original_char, corrected_char, start, end)`` tuples ordered by
        ``start``.
    """
    if corrected_text is None:
        corrected_text = _predict_text(text)
    # Compare only as many characters as the input has.
    corrected_text = corrected_text[:len(text)]

    sub_details = []
    for i, ori_char in enumerate(text):
        if ori_char in _SKIPPED_CHARS:
            # Put the skipped character back so every later index lines up
            # with the original text instead of drifting by one.
            corrected_text = corrected_text[:i] + ori_char + corrected_text[i:]
            continue
        if i >= len(corrected_text):
            continue
        cor_char = corrected_text[i]
        if ori_char == cor_char:
            continue
        if ori_char.lower() == cor_char:
            # A pure case difference is not a correction; keep the original
            # upper-case letter in the output.
            corrected_text = corrected_text[:i] + ori_char + corrected_text[i + 1:]
            continue
        sub_details.append((ori_char, cor_char, i, i + 1))

    sub_details.sort(key=operator.itemgetter(2))
    return corrected_text, sub_details
41
 
42
  if __name__ == '__main__':
 
43
  examples = [
44
  ['你究輸入利的手機門號跟生分證就可以了。'],
45
  ['這裡是客服中新,很高性為您服物,請問金天有什麼須要幫忙'],
 
47
  ['我來看以下,他的時價是多少?起實您就可以直皆就不用到門事'],
48
  ['因為你現在月富是六九九嘛,我幫擬減衣百塊,兒且也不會江速'],
49
  ]
 
 
 
50
  gr.Interface(
51
+ fn=ai_text,
52
+ inputs=gr.Textbox(lines=2, label="欲校正的文字"),
53
+ outputs=gr.Textbox(lines=2, label="修正後的文字"),
54
+ title="客服ASR文本AI糾錯系統",
55
+ description="""<a href="https://www.twman.org" target='_blank'>TonTon Huang Ph.D. @ 2024/04 </a><br>
56
+ 輸入ASR文本,糾正同音字/詞錯誤<br>
57
+ Masked Language Model (MLM) as correction BERT""",
58
+ examples=examples
59
  ).launch()