File size: 4,711 Bytes
b79c056
 
 
5acee80
b79c056
 
 
d6e90a6
237df5d
d6e90a6
b9c5636
d6e90a6
237df5d
 
 
d6e90a6
b9c5636
 
b79c056
b9c5636
5887026
b9c5636
5887026
 
 
b79c056
 
5887026
 
b9c5636
 
 
 
 
 
 
 
 
 
 
b79c056
 
9aa7185
 
 
 
 
 
 
b79c056
b9c5636
 
 
deda166
 
b9c5636
60901ea
fbea724
1a66e81
fbea724
520bdcd
 
8fc82cb
520bdcd
b9c5636
5887026
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import operator
import torch
import os
from transformers import BertTokenizer, BertForMaskedLM

# Private model and tokenizer, identified by their Hugging Face Hub repo id.
model_name_or_path = "DeepLearning101/Corrector101zhTW"
# The Hub access token is read from the HF_HOME environment variable.
# NOTE(review): HF_HOME conventionally names the Hugging Face cache directory,
# not a token -- confirm the deployment really stores the token under this name.
auth_token = os.getenv("HF_HOME")

# Load the tokenizer and model once at import time; abort start-up on failure.
try:
    # NOTE(review): recent transformers releases deprecate `use_auth_token` in
    # favour of `token` -- confirm against the pinned version before switching.
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path, use_auth_token=auth_token)
    model = BertForMaskedLM.from_pretrained(model_name_or_path, use_auth_token=auth_token)
    # Put the model in evaluation mode; it is only used for inference here.
    model.eval()
except Exception as e:
    print(f"加載模型或分詞器失敗,錯誤信息:{e}")
    # `exit()` is a helper injected by the `site` module for interactive use
    # and is not guaranteed to exist (e.g. under `python -S`); raising
    # SystemExit directly terminates with the same status code.
    raise SystemExit(1) from e

def ai_text(text):
    """Run the corrector on *text* and return the result as a single string.

    Args:
        text: The raw input string to be corrected.

    Returns:
        The corrected text, a single space, and then the string form of the
        list of per-character differences produced by ``get_errors``.
    """
    # Tokenise first; only the forward pass needs gradient tracking disabled.
    encoded = tokenizer(text, return_tensors="pt", padding=True)
    with torch.no_grad():
        prediction = model(**encoded)

    fixed, changes = get_errors(text, prediction)
    return f"{fixed} {changes}"

def get_errors(text, outputs):
    """Diff the original text against the model's per-position prediction.

    Args:
        text: The original input string.
        outputs: Model output exposing ``.logits``; only the first item
            (``logits[0]``) is decoded.

    Returns:
        A ``(corrected_text, sub_details)`` tuple. ``sub_details`` is a list
        of ``(original_char, corrected_char, start, end)`` tuples in
        ascending position order, where ``start``/``end`` index into *text*
        and ``end == start + 1``.
    """
    # Characters that are never compared. Spaces are stripped from the decoded
    # string below; the rest are presumably dropped by the tokenizer as
    # unknown tokens (TODO confirm). They are copied back into the corrected
    # text at the same index so every later character stays aligned with
    # *text* -- without this, one space shifts all following comparisons.
    skipped_chars = {' ', '“', '”', '‘', '’', '琊', '\n', '…', '—', '擤'}

    predicted_ids = torch.argmax(outputs.logits[0], dim=-1)
    decoded = tokenizer.decode(predicted_ids, skip_special_tokens=True).replace(' ', '')
    corrected_chars = list(decoded)

    sub_details = []
    for i, ori_char in enumerate(text):
        if ori_char in skipped_chars:
            # Re-insert only while the position exists in the prediction, so
            # a short prediction does not collect stray trailing characters.
            if i <= len(corrected_chars):
                corrected_chars.insert(i, ori_char)
            continue
        if i >= len(corrected_chars):
            # The prediction is shorter than the input; nothing to compare.
            continue
        new_char = corrected_chars[i]
        if ori_char == new_char:
            continue
        if ori_char.lower() == new_char:
            # Case-only difference: keep the user's original casing instead
            # of reporting it as a correction.
            corrected_chars[i] = ori_char
            continue
        sub_details.append((ori_char, new_char, i, i + 1))

    # Entries are appended in increasing index order, so no sort is needed.
    return ''.join(corrected_chars), sub_details

if __name__ == '__main__':
    # Sample inputs offered as one-click examples in the UI.
    sample_sentences = [
        '你究輸入利的手機門號跟生分證就可以了。',
        '這裡是客服中新,很高性為您服物,請問金天有什麼須要幫忙',
        '因為我們這邊是按天術比例計蒜給您的,其實不會有態大的穎響。也就是您用前面的資非的廢率來做計算',
        '我來看以下,他的時價是多少?起實您就可以直皆就不用到門事',
        '因為你現在月富是六九九嘛,我幫擬減衣百塊,兒且也不會江速',
    ]
    # The interface takes one row per example, each row being a list of inputs.
    examples = [[sentence] for sentence in sample_sentences]

    page_title = "<h1 align='center'>客服ASR文本AI糾錯系統</h1>"
    # HTML shown under the title: author credit plus a block of related links.
    page_description = """<h2 align='center'><a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D. @ 2024/04 </a></h2><br>
                    輸入ASR文本,糾正同音字/詞錯誤<br>
                    <a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
                    <a href='https://reurl.cc/g6GlZX' target='_blank'>手把手帶你一起踩AI坑</a> | <a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffuision</a><br>
                    <a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a> | <a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工?</a><br>
                    <a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型,它是什麼?想要嗎?</a> | <a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>那些檢索增強生成要踩的坑 </a><br>
                    <a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a> | <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a><br>
                    <a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a> | <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大模型開發會踩的坑</a><br>
                    <a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a> | <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
                    <a href='https://github.com/shibing624/pycorrector' target='_blank'>Masked Language Model (MLM) as correction BERT</a>"""

    # One text box in, one text box out, wired to the corrector function.
    input_box = gr.Textbox(lines=2, label="欲校正的文字")
    output_box = gr.Textbox(lines=2, label="修正後的文字")
    demo = gr.Interface(
        fn=ai_text,
        inputs=input_box,
        outputs=output_box,
        title=page_title,
        description=page_description,
        examples=examples,
    )
    demo.launch()