import requests from paddleocr import PaddleOCR, draw_ocr from PIL import Image import gradio as gr from transformers import AutoModelForCausalLM, AutoTokenizer import torch img = "input_data/ocr_input/korean1.jpg" text = "표현이 서툰 것도 잘못인가요. 나 차가운 도시에 따뜻한 여잔데. 그냥 좋아한단 말도 안 되는가요. 솔직하게 난 말하고 싶어요" model_id = "deepseek-ai/deepseek-llm-7b-chat" tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True) def text_inference(text, language): system_prompt = ( f"Given the following {language} text, extract all words in their base (dictionary) form, including verbs, adjectives, nouns, and particles. " "Remove all duplicates. Return the base form words as a comma-separated list, and nothing else." ) user_prompt = f"{system_prompt}\n\nText:\n{text}" input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device) output_ids = model.generate(input_ids, max_new_tokens=256) output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True) # Parse response: take last line, split by commas last_line = output_text.strip().split("\n")[-1] words = [w.strip() for w in last_line.split(",") if w.strip()] return words def ocr_inference(img, lang): ocr = PaddleOCR(use_angle_cls=True, lang=lang,use_gpu=False) img_path = img result = ocr.ocr(img_path, cls=True)[0] image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] scores = [line[1][1] for line in result] return txts def make_flashcards(words, language): system_prompt = ( f"for each {language} word in the list, write a flashcard in this format: the word, then its definition, then an example sentence using the word, and then a translation of example sentence" ) user_prompt = f"{system_prompt}\n\nWords:\n{words}" input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device) output_ids = model.generate(input_ids, max_new_tokens=256) output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True) # Parse response: take last line, split by commas last_line = output_text.strip().split("\n")[-1] output = [w.strip() for w in last_line.split(":") if w.strip()] return output # words=text_inference(text, "korean") # print("OUTPUT TOUT OUETOI EIFJ IEFJ",words) # print("flashcard output:",make_flashcards(words, "korean")) print("OCR OUTPUT: ", ocr_inference(img, "korean")) words=text_inference(text, "korean") print("TEXT INPUT: ", text) print("WORD PARSING: ",words) print("flashcard output:",make_flashcards(words, "korean"))