"""Gradio app that generates English multiple-choice questions from pasted text, PDF files, or images (via OCR)."""
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import gradio as gr
import easyocr
import pdfplumber
import random

# Load the question-generation model and its tokenizer once at startup.
MODEL_PATH = "valhalla/t5-base-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)


def generate_text(prompt, max_length=100, temperature=0.8, top_p=0.9):
    """Run the seq2seq model on the prompt and return the decoded, sampled output."""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length, do_sample=True, temperature=temperature, top_p=top_p)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def question_generator_with_answer(context):
    """Generate one multiple-choice question: the question, its answer, and three distractors."""
    question = generate_text(f"question: {context}")
    correct_answer = generate_text(f"answer: {context}", temperature=1.0)

    # Sample distractor answers; cap the attempts so repeated generations cannot loop forever.
    wrong_answers = set()
    attempts = 0
    while len(wrong_answers) < 3 and attempts < 20:
        attempts += 1
        wrong_answer = generate_text(f"answer: {context}", max_length=50, temperature=1.0, top_p=0.8)
        if wrong_answer != correct_answer and "?" not in wrong_answer:
            wrong_answers.add(wrong_answer)

    choices = list(wrong_answers) + [correct_answer]
    random.shuffle(choices)

    return {
        "question": question,
        "choices": choices,
        "correct_answer": correct_answer,
    }


def format_question_output(context):
    """Build four multiple-choice questions from the context and format them as plain text."""
    question_result = []
    for j in range(4):
        result = question_generator_with_answer(context)
        question_text = f"{result['question']}\n"
        choices_text = "\n".join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(result['choices'])])
        question_result.append(f"\nQ{j + 1}. {question_text}\n{choices_text}\n")
    return "\n".join(question_result)


def extract_text_from_pdf(pdf_path):
    """Extract the text of an uploaded PDF and generate questions from it."""
    text = ""
    # gr.File passes a temporary file object, so open it via its .name path.
    with pdfplumber.open(pdf_path.name) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for pages without a text layer.
            text += (page.extract_text() or "") + "\n"
    return format_question_output(text)


def OCR(photo):
    """Run EasyOCR (English + Traditional Chinese) on the image and return the recognized text."""
    reader = easyocr.Reader(['en', 'ch_tra'])
    results = reader.readtext(photo)
    # Join the detected segments with spaces so English words stay separated.
    return " ".join([text for (_, text, _) in results])


def OCR_gen(text):
    """Generate questions from OCR output, guarding against empty recognition results."""
    if not text.strip():
        # Error message (zh-TW): OCR produced no usable text; please re-check the image.
        return "錯誤:OCR 沒有輸出任何可用的文字,請重新檢查圖片內容。"
    return format_question_output(text)


with gr.Blocks() as demo:
    gr.Markdown("<h1>產生英文題目</h1>")
    gr.Markdown("這是一個利用 hugging face 產生英文題目的小專案")

    with gr.Tabs():
        # Tab 1: paste an English passage directly.
        with gr.TabItem("輸入文字"):
            text_input = gr.Textbox(label="請輸入英文文章:", placeholder="While lily is setting...", lines=5)
            text_output = gr.Textbox(label="題目")
            text_button = gr.Button("產生題目")

        # Tab 2: upload a PDF document.
        with gr.TabItem("PDF文件辨識"):
            PDF_input = gr.File(label="請上傳PDF文件")
            PDF_output = gr.Textbox()
            PDF_button = gr.Button("產生題目")

        # Tab 3: run OCR on an image, confirm the recognized text, then generate questions.
        with gr.TabItem("圖片辨識"):
            image_input = gr.Image()
            img_tem = gr.Textbox(placeholder="請確認辨識結果", label="辨識結果")
            img_button = gr.Button("開始解析")
            image_button = gr.Button("產生題目")
            image_output = gr.Textbox(label="題目")

    text_button.click(format_question_output, inputs=text_input, outputs=text_output)
    PDF_button.click(extract_text_from_pdf, inputs=PDF_input, outputs=PDF_output)
    img_button.click(OCR, inputs=image_input, outputs=img_tem)
    image_button.click(OCR_gen, inputs=img_tem, outputs=image_output)

demo.launch()