Spaces:
Sleeping
Sleeping
import requests | |
from paddleocr import PaddleOCR, draw_ocr | |
from PIL import Image | |
import gradio as gr | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
import torch | |
# Sample inputs consumed by the smoke run at the bottom of this script.
img = "input_data/ocr_input/korean1.jpg"
# NOTE(review): this sample looks like mis-decoded (mojibake) Korean text --
# confirm the encoding this file was saved/served with.
text = "ννμ΄ μν° κ²λ μλͺ»μΈκ°μ. λ μ°¨κ°μ΄ λμμ λ°λ»ν μ¬μλ°. κ·Έλ₯ μ’μνλ¨ λ§λ μ λλκ°μ. μμ§νκ² λ λ§νκ³ μΆμ΄μ"

# Chat model and tokenizer shared by text_inference() and make_flashcards().
# Loaded once at import time, in half precision, with automatic device placement.
model_id = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
def text_inference(text, language, max_new_tokens=256):
    """Reduce ``text`` to a de-duplicated list of base-form words.

    The module-level chat ``model`` is asked to lemmatize every word of
    ``text`` and answer with a comma-separated list; the last non-blank line
    of its reply is then split on commas.

    Args:
        text: The passage to analyse.
        language: Name of the passage's language, inserted into the prompt.
        max_new_tokens: Upper bound on generated tokens. Raise it for long
            passages, otherwise the list may be cut off mid-way.

    Returns:
        The base-form words in the order the model produced them, with
        duplicates removed. Empty when ``text`` is blank or the model
        returns nothing.
    """
    if not text or not text.strip():
        # Nothing to lemmatize; skip a slow and pointless generation call.
        return []

    system_prompt = (
        f"Given the following {language} text, convert each word into their base form. "
        "Remove all duplicates. Return the base form words as a comma-separated list, and nothing else."
    )
    user_prompt = f"{system_prompt}\n\nText:\n{text}"

    # add_generation_prompt=True appends the assistant-turn marker so the model
    # answers the request instead of continuing the user's message.
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; decode only the new tokens so the
    # echoed prompt can never be mistaken for the answer.
    prompt_length = input_ids.shape[-1]
    response = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    reply_lines = [line for line in response.splitlines() if line.strip()]
    if not reply_lines:
        return []

    # The answer is expected on the last non-blank line, comma-separated.
    candidates = (piece.strip() for piece in reply_lines[-1].split(","))
    # dict.fromkeys drops repeats while keeping first-seen order; the prompt
    # asks the model to de-duplicate, but that is not guaranteed.
    return list(dict.fromkeys(word for word in candidates if word))
# PaddleOCR engines keyed by language code; building one loads model weights,
# so each is created once and reused across calls.
_OCR_ENGINES = {}


def _get_ocr_engine(lang):
    """Return the CPU PaddleOCR engine for ``lang``, creating it on first use."""
    engine = _OCR_ENGINES.get(lang)
    if engine is None:
        engine = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=False)
        _OCR_ENGINES[lang] = engine
    return engine


def ocr_inference(img, lang):
    """Run OCR on an image and return the recognised text lines.

    Args:
        img: The image to read, passed straight to ``PaddleOCR.ocr``
            (the script passes a file path).
        lang: PaddleOCR language code, e.g. ``"korean"``.

    Returns:
        The recognised strings, one per detected text line, in the order
        PaddleOCR reports them. Empty when no text is detected.
    """
    result = _get_ocr_engine(lang).ocr(img, cls=True)

    # Only the first entry of the result is used. An image with no detected
    # text can come back as None (or empty) rather than a list of lines, so
    # guard before iterating.
    page = result[0] if result else None
    if not page:
        return []

    # Each line is [box, (text, score)]; only the text is needed here.
    return [line[1][0] for line in page]
def make_flashcards(words, language, max_new_tokens=256):
    """Ask the chat model to write a flashcard for every word in ``words``.

    Each flashcard is requested as: the word, its definition, an example
    sentence, and a translation of that sentence.

    Args:
        words: The words to build flashcards for; a list of strings or a
            single pre-formatted string.
        language: Name of the words' language, inserted into the prompt.
        max_new_tokens: Upper bound on generated tokens. Each card needs
            several sentences, so raise this for more than a few words or
            the output will be truncated.

    Returns:
        The model's reply as a list of stripped, non-blank lines covering
        all flashcards. Empty when ``words`` is empty.
    """
    if not words:
        return []

    # Send the words as plain comma-separated text rather than a Python list
    # repr (brackets and quotes) that the model would have to see through.
    if isinstance(words, str):
        word_list = words
    else:
        word_list = ", ".join(str(word) for word in words)

    system_prompt = (
        f"for each {language} word in the list, write a flashcard in this format: the word, then its definition, then an example sentence using the word, and then a translation of example sentence"
    )
    user_prompt = f"{system_prompt}\n\nWords:\n{word_list}"

    # add_generation_prompt=True appends the assistant-turn marker so the model
    # answers the request instead of continuing the user's message.
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; decode only the new tokens.
    prompt_length = input_ids.shape[-1]
    response = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    # Flashcards span many lines, so keep the whole reply rather than only its
    # final line.
    return [line.strip() for line in response.splitlines() if line.strip()]
# Smoke run: OCR the sample image, then lemmatize the sample text and turn the
# resulting words into flashcards.
ocr_lines = ocr_inference(img, "korean")
print("OCR OUTPUT: ", ocr_lines)

words = text_inference(text, "korean")
print("TEXT INPUT: ", text)
print("WORD PARSING: ", words)

flashcards = make_flashcards(words, "korean")
print("flashcard output:", flashcards)