File size: 4,711 Bytes
b79c056 5acee80 b79c056 d6e90a6 237df5d d6e90a6 b9c5636 d6e90a6 237df5d d6e90a6 b9c5636 b79c056 b9c5636 5887026 b9c5636 5887026 b79c056 5887026 b9c5636 b79c056 9aa7185 b79c056 b9c5636 deda166 b9c5636 60901ea fbea724 1a66e81 fbea724 520bdcd 8fc82cb 520bdcd b9c5636 5887026 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import gradio as gr
import operator
import torch
import os
from transformers import BertTokenizer, BertForMaskedLM
# Private model repository (weights and tokenizer) used by this app.
model_name_or_path = "DeepLearning101/Corrector101zhTW"
# NOTE(review): the access token is read from the HF_HOME environment
# variable. HF_HOME is conventionally the Hugging Face cache directory, not a
# token -- confirm this deployment really stores the token under that name.
auth_token = os.getenv("HF_HOME")

# Load the tokenizer and the masked-LM model once, at start-up. The app cannot
# do anything without them, so a failure terminates the process with status 1.
try:
    tokenizer = BertTokenizer.from_pretrained(model_name_or_path, use_auth_token=auth_token)
    model = BertForMaskedLM.from_pretrained(model_name_or_path, use_auth_token=auth_token)
    model.eval()  # switch the module to evaluation mode
except Exception as e:
    print(f"加載模型或分詞器失敗,錯誤信息:{e}")
    # `exit()` is a convenience injected by the `site` module for interactive
    # sessions and is not guaranteed to exist (e.g. `python -S`, frozen
    # builds). Raising SystemExit is the portable way to stop with a status.
    raise SystemExit(1) from e
def ai_text(text):
    """Run the correction model on ``text`` and format the result for display.

    Args:
        text: The string to be corrected.

    Returns:
        The corrected text, a single space, and then the ``str()`` form of
        the list of detected edits produced by ``get_errors``.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True)
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        model_outputs = model(**encoded)
        fixed_text, edits = get_errors(text, model_outputs)
    return f"{fixed_text} {edits}"
def get_errors(text, outputs):
    """Diff the original text against the model prediction, char by char.

    The prediction is decoded from the highest-scoring token id at every
    position of ``outputs.logits[0]``, with special tokens dropped and all
    spaces removed. Because of that, characters such as spaces and newlines
    never appear in the decoded string; they are re-inserted from ``text`` at
    their original index so that position ``i`` in ``text`` and position ``i``
    in the corrected string keep referring to the same character. Without
    this, every comparison after the first skipped character would be shifted
    and report spurious edits.

    Args:
        text: The original input string.
        outputs: Model output object; only ``outputs.logits[0]`` is read
            (presumably shaped (seq_len, vocab_size) -- TODO confirm).

    Returns:
        A tuple ``(corrected_text, sub_details)``. ``corrected_text`` is the
        decoded prediction with the skipped original characters restored.
        ``sub_details`` is a list of
        ``(original_char, corrected_char, start, end)`` tuples sorted by
        ``start``, where ``end == start + 1``.
    """
    # Characters that are never compared. Spaces and newlines are certainly
    # missing from the decoded text (see the ``replace`` below); the remaining
    # ones are presumably out-of-vocabulary for the tokenizer and therefore
    # dropped as special tokens on decode -- verify against the model vocab.
    skipped_chars = {' ', '“', '”', '‘', '’', '琊', '\n', '…', '—', '擤'}

    predicted_ids = torch.argmax(outputs.logits[0], dim=-1)
    corrected_text = tokenizer.decode(predicted_ids, skip_special_tokens=True).replace(' ', '')

    sub_details = []
    for i, ori_char in enumerate(text):
        if ori_char in skipped_chars:
            # Put the original character back so later indices stay aligned.
            corrected_text = corrected_text[:i] + ori_char + corrected_text[i:]
            continue
        if i >= len(corrected_text):
            # The prediction is shorter than the input; nothing to compare.
            continue
        if ori_char != corrected_text[i]:
            sub_details.append((ori_char, corrected_text[i], i, i + 1))

    sub_details = sorted(sub_details, key=operator.itemgetter(2))
    return corrected_text, sub_details
if __name__ == '__main__':
    # Sample inputs offered as ready-made examples in the web UI.
    sample_inputs = [
        ['你究輸入利的手機門號跟生分證就可以了。'],
        ['這裡是客服中新,很高性為您服物,請問金天有什麼須要幫忙'],
        ['因為我們這邊是按天術比例計蒜給您的,其實不會有態大的穎響。也就是您用前面的資非的廢率來做計算'],
        ['我來看以下,他的時價是多少?起實您就可以直皆就不用到門事'],
        ['因為你現在月富是六九九嘛,我幫擬減衣百塊,兒且也不會江速'],
    ]

    # HTML shown above the input/output boxes: page title and a block of links.
    page_title = "<h1 align='center'>客服ASR文本AI糾錯系統</h1>"
    page_description = """<h2 align='center'><a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D. @ 2024/04 </a></h2><br>
輸入ASR文本,糾正同音字/詞錯誤<br>
<a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
<a href='https://reurl.cc/g6GlZX' target='_blank'>手把手帶你一起踩AI坑</a> | <a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffuision</a><br>
<a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a> | <a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工?</a><br>
<a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型,它是什麼?想要嗎?</a> | <a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>那些檢索增強生成要踩的坑 </a><br>
<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a> | <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a><br>
<a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a> | <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大模型開發會踩的坑</a><br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a> | <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
<a href='https://github.com/shibing624/pycorrector' target='_blank'>Masked Language Model (MLM) as correction BERT</a>"""

    # One text box in, one text box out, wired to ai_text.
    demo = gr.Interface(
        fn=ai_text,
        inputs=gr.Textbox(lines=2, label="欲校正的文字"),
        outputs=gr.Textbox(lines=2, label="修正後的文字"),
        title=page_title,
        description=page_description,
        examples=sample_inputs,
    )
    demo.launch()
|