import requests
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

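# Sample inputs: an image for the OCR path and a Korean sentence for the text path.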
img = "input_data/ocr_input/japan1.jpg"
text = "ν‘œν˜„μ΄ μ„œνˆ° 것도 잘λͺ»μΈκ°€μš”. λ‚˜ μ°¨κ°€μš΄ λ„μ‹œμ— λ”°λœ»ν•œ μ—¬μž”λ°. κ·Έλƒ₯ μ’‹μ•„ν•œλ‹¨ 말도 μ•ˆ λ˜λŠ”κ°€μš”. μ†”μ§ν•˜κ²Œ λ‚œ λ§ν•˜κ³  μ‹Άμ–΄μš”"

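# Load the DeepSeek 7B chat model once and reuse it for both word extraction
# and flashcard generation; device_map="auto" places it on a GPU when available.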
model_id = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)

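# Ask the LLM for the base (dictionary) forms of all words in the given text
# and parse its reply into a Python list of words.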
def text_inference(text, language):
    system_prompt = (
        f"Given the following {language} text, extract all words in their base (dictionary) form, including verbs, adjectives, nouns, and particles. "
        "Remove all duplicates. Return the base form words as a comma-separated list, and nothing else."
    )
    user_prompt = f"{system_prompt}\n\nText:\n{text}"

    input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=256)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Parse response: take the last line of the decoded output, split on commas
    last_line = output_text.strip().split("\n")[-1]
    words = [w.strip() for w in last_line.split(",") if w.strip()]
    return words

def ocr_inference(img, lang):
    # Run PaddleOCR on the image; only the recognized text strings are returned,
    # but boxes and confidence scores are also extracted for possible later use.
    ocr = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=False)
    result = ocr.ocr(img, cls=True)[0]
    image = Image.open(img).convert('RGB')
    boxes = [line[0] for line in result]
    txts = [line[1][0] for line in result]
    scores = [line[1][1] for line in result]
    return txts

def make_flashcards(words, language):
    # For each base-form word, ask the LLM for an Anki-style flashcard
    # (word, definition, example sentence) as colon-separated fields.
    cards = []
    for word in words:
        system_prompt = (
            f"Given the following {language} word, create an anki flashcard which has the word, the definition, and an example sentence using that word. "
            "Return the flashcard as each field separated by colons."
        )
        user_prompt = f"{system_prompt}\n\nWord:\n{word}"

        input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device)
        output_ids = model.generate(input_ids, max_new_tokens=256)
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Parse response: take the last line of the decoded output, split on colons
        last_line = output_text.strip().split("\n")[-1]
        card = [w.strip() for w in last_line.split(":") if w.strip()]
        cards.append(card)
    return cards
    
# Run the text-only pipeline on the sample sentence and print the results.
words = text_inference(text, "korean")
print("extracted base-form words:", words)
print("flashcard output:", make_flashcards(words, "korean"))