dimasdeffieux committed on
Commit
2000233
·
verified ·
1 Parent(s): 80f49c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -4
app.py CHANGED
@@ -3,10 +3,31 @@ import requests
3
  from paddleocr import PaddleOCR, draw_ocr
4
  from PIL import Image
5
  import gradio as gr
 
 
6
 
7
  img = "input_data/ocr_input/japan1.jpg"
8
 
9
- def inference(img, lang):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  ocr = PaddleOCR(use_angle_cls=True, lang=lang,use_gpu=False)
11
  img_path = img
12
  result = ocr.ocr(img_path, cls=True)[0]
@@ -16,6 +37,5 @@ def inference(img, lang):
16
  scores = [line[1][1] for line in result]
17
  return txts
18
 
19
-
20
- print(inference(img,"japan"))
21
- #balls
 
3
  from paddleocr import PaddleOCR, draw_ocr
4
  from PIL import Image
5
  import gradio as gr
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer
7
+ import torch
8
 
9
  img = "input_data/ocr_input/japan1.jpg"
10
 
11
+ model_id = "deepseek-ai/deepseek-llm-7b-chat"
12
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
13
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)
14
+
15
+ def text_inference(text, language):
16
+ system_prompt = (
17
+ f"Given the following {language} text, extract all words in their base (dictionary) form, including verbs, adjectives, nouns, and particles. "
18
+ "Remove all duplicates. Return the base form words as a comma-separated list, and nothing else."
19
+ )
20
+ user_prompt = f"{system_prompt}\n\nText:\n{text}"
21
+
22
+ input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device)
23
+ output_ids = model.generate(input_ids, max_new_tokens=256)
24
+ output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
25
+
26
+ # Parse comma-separated string into list
27
+ words = [word.strip() for word in output_text.split(",") if word.strip()]
28
+ return words
29
+
30
+ def ocr_inference(img, lang):
31
  ocr = PaddleOCR(use_angle_cls=True, lang=lang,use_gpu=False)
32
  img_path = img
33
  result = ocr.ocr(img_path, cls=True)[0]
 
37
  scores = [line[1][1] for line in result]
38
  return txts
39
 
40
+ def make_flashcards(words):
41
+ pass;