dimasdeffieux committed
Commit c516aac · verified · 1 Parent(s): 1ede285

Update app.py

Files changed (1): app.py (+56 -74)
app.py CHANGED
@@ -1,88 +1,70 @@
-import requests
-from paddleocr import PaddleOCR, draw_ocr
-from PIL import Image
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch

-img = "input_data/ocr_input/korean1.jpg"
-text = "표현이 서툰 것도 잘못인가요. 나 차가운 도시에 따뜻한 여잔데. 그냥 좋아한단 말도 안 되는가요. 솔직하게 난 말하고 싶어요"
-model_id = "deepseek-ai/deepseek-llm-7b-chat"

-tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)

 def text_inference(text, language):
-    system_prompt = (
-        f"Given the following {language} text, convert each word into their base form. Remove all duplicates. Return the base form words as a comma-separated list, and nothing else."
     )
-    user_prompt = f"{system_prompt}\n\nText:\n{text}"
-
-    input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device)
-    output_ids = model.generate(input_ids, max_new_tokens=256)
-    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-    # Parse response: take last line, split by commas
-    last_line = output_text.strip().split("\n")[-1]
-    words = [w.strip() for w in last_line.split(",") if w.strip()]
     return words

-def ocr_inference(img, lang):
-    ocr = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=False)
-    img_path = img
-    result = ocr.ocr(img_path, cls=True)[0]
-    image = Image.open(img_path).convert('RGB')
-    boxes = [line[0] for line in result]
-    txts = [line[1][0] for line in result]
-    scores = [line[1][1] for line in result]
-    return txts
-
 def make_flashcards(words, language):
-
-    system_prompt = (
-        f"for each {language} word in the list, write a flashcard in this format: the word, then its definition, then an example sentence using the word, and then a translation of example sentence"
     )
-    user_prompt = f"{system_prompt}\n\nWords:\n{words}"
-
-    input_ids = tokenizer.apply_chat_template([{"role": "user", "content": user_prompt}], return_tensors="pt").to(model.device)
-    output_ids = model.generate(input_ids, max_new_tokens=256)
-    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-    # Parse response: take last line, split by commas
-    last_line = output_text.strip().split("\n")[-1]
-    output = [w.strip() for w in last_line.split(":") if w.strip()]
-    return output
-
-# words=text_inference(text, "korean")
-# print("OUTPUT TOUT OUETOI EIFJ IEFJ",words)
-# print("flashcard output:",make_flashcards(words, "korean"))
-
-# print("OCR OUTPUT: ", ocr_inference(img, "korean"))
-# words=text_inference(text, "korean")
-# print("TEXT INPUT: ", text)
-# print("WORD PARSING: ",words)
-# print("flashcard output:",make_flashcards(words, "korean"))

-examples = [
-    [{"text": "@RolmOCR OCR the Text in the Image", "files": ["rolm/1.jpeg"]}],
-    [{"text": "@RolmOCR Explain the Ad in Detail", "files": ["examples/videoplayback.mp4"]}],
-    [{"text": "@RolmOCR OCR the Image", "files": ["rolm/3.jpeg"]}],
-    [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
-]

-demo = gr.ChatInterface(
-    fn=ocr_inference,
-    description="# **Multimodal OCR `@RolmOCR and Default Qwen2VL OCR`**",
-    examples=examples,
-    textbox=gr.MultimodalTextbox(
-        label="Query Input",
-        file_types=["image", "video"],
-        file_count="multiple",
-        placeholder="Use tag @RolmOCR for RolmOCR, or leave blank for default Qwen2VL OCR"
-    ),
-    stop_btn="Stop Generation",
-    multimodal=True,
-    cache_examples=False,
 )

-demo.launch(debug=True)
 
 import gradio as gr
+from llama_cpp import Llama
+from paddleocr import PaddleOCR
+from PIL import Image

+# Load GGUF model
+llm = Llama(
+    model_path="./deepseek-v3-0324.Q4_K_M.gguf",  # Make sure this file is in your repo
+    n_ctx=2048,
+    n_threads=8,
+    n_gpu_layers=20  # Set to 0 if you are on CPU-only
+)

+# OCR function
+def ocr_inference(img, lang):
+    ocr = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=False)
+    result = ocr.ocr(img, cls=True)[0]
+    txts = [line[1][0] for line in result]
+    return " ".join(txts)

+# Step 1: Convert text to base form words
 def text_inference(text, language):
+    prompt = (
+        f"Given the following {language} text, convert each word into its base form. "
+        f"Remove all duplicates. Return the base form words as a comma-separated list.\n\n"
+        f"Text:\n{text}"
     )
+    response = llm(prompt, max_tokens=256, stop=["</s>"])
+    output_text = response["choices"][0]["text"].strip()
+    words = [w.strip() for w in output_text.split(",") if w.strip()]
     return words

+# Step 2: Generate flashcards for those words
 def make_flashcards(words, language):
+    prompt = (
+        f"For each {language} word in the list, write a flashcard in this format:\n"
+        f"word - definition - example sentence - translated sentence.\n\n"
+        f"Words:\n{', '.join(words)}"
     )
+    response = llm(prompt, max_tokens=512, stop=["</s>"])
+    return response["choices"][0]["text"].strip()

+# Wrapper logic to handle OCR or text
+def flashcard_pipeline(text, image, language):
+    if image:
+        text = ocr_inference(image, language)
+    if not text:
+        return "", "Please provide either text or an image."
+    words = text_inference(text, language)
+    flashcards = make_flashcards(words, language)
+    return "\n".join(words), flashcards

+# Gradio UI
+demo = gr.Interface(
+    fn=flashcard_pipeline,
+    inputs=[
+        gr.Textbox(label="Input Text (leave empty to use image)", lines=4, placeholder="Type or paste sentence here..."),
+        gr.Image(label="Upload Image for OCR (optional)", type="filepath"),
+        gr.Dropdown(choices=["korean", "japan", "french", "ch"], label="Language (for OCR and LLM)")
+    ],
+    outputs=[
+        gr.Textbox(label="Base Form Words"),
+        gr.Textbox(label="Flashcards"),
+    ],
+    title="Language Flashcard Generator (with OCR + DeepSeek GGUF)",
+    description="Either input text or upload an image. The app will extract words, lemmatize them, and generate flashcards."
 )

+if __name__ == "__main__":
+    demo.launch()
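
The refactored pipeline can be sanity-checked without launching the Gradio UI by calling the functions directly. A minimal sketch, not part of the commit: it assumes the GGUF file referenced by model_path is present and that app.py is importable (demo.launch() is guarded by the __main__ check, so importing does not start the server). The sample sentence and image path are taken from the removed test code.

# smoke_test.py - hypothetical local check for the refactored app
from app import flashcard_pipeline

# Text-only path: lemmatize a Korean sentence, then generate flashcards.
# Both return values are strings (newline-joined words, flashcard text).
words, cards = flashcard_pipeline("솔직하게 난 말하고 싶어요", None, "korean")
print("Base forms:\n" + words)
print("Flashcards:\n" + cards)

# OCR path: pass an image filepath instead of text. Note that the dropdown
# choices ("korean", "japan", "french", "ch") are PaddleOCR language codes,
# and they are also interpolated verbatim into the LLM prompts.
# words, cards = flashcard_pipeline("", "input_data/ocr_input/korean1.jpg", "korean")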