Update app.py
app.py CHANGED
@@ -7,29 +7,34 @@ import gradio as gr
 from gradio import FileData
 import time
 import spaces
-
-import
-
-import tempfile
+import fitz  # PyMuPDF
+import io
+import numpy as np
 
 ckpt = "Daemontatox/DocumentCogito"
 model = MllamaForConditionalGeneration.from_pretrained(ckpt,
                                                        torch_dtype=torch.bfloat16).to("cuda")
 processor = AutoProcessor.from_pretrained(ckpt)
 
-def
-    """Convert PDF
-
-
+def process_pdf_file(file_path):
+    """Convert PDF to images and extract text using PyMuPDF."""
+    doc = fitz.open(file_path)
+    images = []
     text = ""
-
-
+
+    for page in doc:
+        # Extract text
+        text += page.get_text() + "\n"
+
+        # Convert page to image
+        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
+        img_data = pix.tobytes("png")
+        img = Image.open(io.BytesIO(img_data))
+        images.append(img.convert("RGB"))
+
+    doc.close()
     return images, text
 
-def is_pdf(file_path):
-    """Check if the file is a PDF."""
-    return file_path.lower().endswith('.pdf')
-
 @spaces.GPU()
 def bot_streaming(message, history, max_new_tokens=2048):
     txt = message["text"]

@@ -41,7 +46,7 @@ def bot_streaming(message, history, max_new_tokens=2048):
     # Process history
     for i, msg in enumerate(history):
         if isinstance(msg[0], tuple):
-            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
+            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
             images.append(Image.open(msg[0][0]).convert("RGB"))
         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):

@@ -52,11 +57,13 @@ def bot_streaming(message, history, max_new_tokens=2048):
 
     # Process current message
     if len(message["files"]) == 1:
-
+        file_data = message["files"][0]
+        file_path = file_data["path"] if isinstance(file_data, dict) else file_data
 
-        if
-
-
+        # Check if file is PDF
+        if file_path.lower().endswith('.pdf'):
+            # Process PDF
+            pdf_images, pdf_text = process_pdf_file(file_path)
             images.extend(pdf_images)
             txt = f"{txt}\nExtracted text from PDF:\n{pdf_text}"
         else:

@@ -73,11 +80,16 @@ def bot_streaming(message, history, max_new_tokens=2048):
     if not images:
         inputs = processor(text=texts, return_tensors="pt").to("cuda")
     else:
+        # Handle multiple images if needed
+        max_images = 4  # Limit number of images to process
+        if len(images) > max_images:
+            images = images[:max_images]
+            txt += f"\n(Note: Only processing first {max_images} pages of the PDF)"
+
         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
 
     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-    generated_text = ""
 
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()

@@ -85,10 +97,10 @@ def bot_streaming(message, history, max_new_tokens=2048):
 
     for new_text in streamer:
         buffer += new_text
-        generated_text_without_prompt = buffer
         time.sleep(0.01)
         yield buffer
 
+# Create the Gradio interface
 demo = gr.ChatInterface(
     fn=bot_streaming,
     title="Document Analyzer",

@@ -116,7 +128,8 @@ demo = gr.ChatInterface(
     multimodal=True
 )
 
-# Update file types
+# Update accepted file types
 demo.textbox.file_types = ["image", "pdf"]
 
+# Launch the interface
 demo.launch(debug=True)
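The 300/72 matrix in process_pdf_file is the points-to-DPI conversion: PDF pages are measured in points at 72 per inch, so scaling by 300/72 rasterizes at 300 DPI. The same PyMuPDF pattern can be exercised on its own for local testing; a minimal sketch, assuming pymupdf and Pillow are installed, where pdf_to_images_and_text and "sample.pdf" are illustrative names rather than part of the Space:

import io

import fitz  # PyMuPDF
from PIL import Image

def pdf_to_images_and_text(file_path, dpi=300):
    """Render each page to a PIL image and collect its text."""
    doc = fitz.open(file_path)
    images, text = [], ""
    zoom = dpi / 72  # PDF points are 1/72 inch, so this scales pages to the target DPI
    for page in doc:
        text += page.get_text() + "\n"
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        images.append(Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB"))
    doc.close()
    return images, text

# "sample.pdf" is a stand-in path for local testing.
pages, text = pdf_to_images_and_text("sample.pdf")
print(f"{len(pages)} pages, {len(text)} characters of extracted text")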
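The streaming half of bot_streaming, a TextIteratorStreamer fed by model.generate running on a background thread, can also be tried in isolation. A minimal sketch using a small text-only checkpoint so it runs without a GPU; gpt2 is purely illustrative here, while the Space streams from the multimodal model above:

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Document analysis with multimodal models", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

# generate() blocks until completion, so it runs on a worker thread while
# the main thread consumes decoded text pieces as they become available.
thread = Thread(target=model.generate, kwargs=dict(inputs, streamer=streamer, max_new_tokens=40))
thread.start()
for piece in streamer:
    print(piece, end="", flush=True)
thread.join()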