Daemontatox committed on
Commit e4611cf · verified · 1 Parent(s): e574b9a

Update app.py

Files changed (1): app.py (+35, -22)
app.py CHANGED
@@ -7,29 +7,34 @@ import gradio as gr
from gradio import FileData
import time
import spaces
- from pdf2image import convert_from_path
- import os
- from PyPDF2 import PdfReader
- import tempfile
+ import fitz  # PyMuPDF
+ import io
+ import numpy as np

ckpt = "Daemontatox/DocumentCogito"
model = MllamaForConditionalGeneration.from_pretrained(ckpt,
    torch_dtype=torch.bfloat16).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)

- def process_pdf(pdf_path):
-     """Convert PDF pages to images and extract text."""
-     images = convert_from_path(pdf_path)
-     pdf_reader = PdfReader(pdf_path)
+ def process_pdf_file(file_path):
+     """Convert PDF to images and extract text using PyMuPDF."""
+     doc = fitz.open(file_path)
+     images = []
    text = ""
-     for page in pdf_reader.pages:
-         text += page.extract_text() + "\n"
+
+     for page in doc:
+         # Extract text
+         text += page.get_text() + "\n"
+
+         # Convert page to image
+         pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
+         img_data = pix.tobytes("png")
+         img = Image.open(io.BytesIO(img_data))
+         images.append(img.convert("RGB"))
+
+     doc.close()
    return images, text

- def is_pdf(file_path):
-     """Check if the file is a PDF."""
-     return file_path.lower().endswith('.pdf')
-
@spaces.GPU()
def bot_streaming(message, history, max_new_tokens=2048):
    txt = message["text"]
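For reference, a minimal standalone sketch (not part of the commit) of what the new helper does per page. It assumes PyMuPDF and Pillow are installed and uses a hypothetical local sample.pdf; the fitz.Matrix(300/72, 300/72) argument scales the default 72 DPI rendering up to roughly 300 DPI:

import io
import fitz  # PyMuPDF
from PIL import Image

doc = fitz.open("sample.pdf")                                    # hypothetical input file
page = doc[0]
pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))    # render at ~300 DPI
img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")  # same PNG round-trip as app.py
print(img.size)  # about 2550 x 3300 pixels for a US Letter page
doc.close()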
@@ -41,7 +46,7 @@ def bot_streaming(message, history, max_new_tokens=2048):
    # Process history
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
-             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "text", "text": history[i+1][1]}]})
+             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
            images.append(Image.open(msg[0][0]).convert("RGB"))
        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
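The change above swaps the duplicated text entry for an {"type": "image"} placeholder, so each image appended to images has a matching image slot in the user turn. A sketch of the intended pairing, assuming app.py builds the prompt with processor.apply_chat_template (the usual transformers API; that call is not shown in this hunk):

messages = [
    {"role": "user", "content": [
        {"type": "text", "text": "What does this page say?"},
        {"type": "image"},  # placeholder; the chat template emits one image token here
    ]},
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# The number of {"type": "image"} placeholders should match len(images) passed to:
# inputs = processor(text=prompt, images=images, return_tensors="pt")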
@@ -52,11 +57,13 @@ def bot_streaming(message, history, max_new_tokens=2048):

    # Process current message
    if len(message["files"]) == 1:
-         file_path = message["files"][0]["path"] if isinstance(message["files"][0], dict) else message["files"][0]
+         file_data = message["files"][0]
+         file_path = file_data["path"] if isinstance(file_data, dict) else file_data

-         if is_pdf(file_path):
-             # Handle PDF
-             pdf_images, pdf_text = process_pdf(file_path)
+         # Check if file is PDF
+         if file_path.lower().endswith('.pdf'):
+             # Process PDF
+             pdf_images, pdf_text = process_pdf_file(file_path)
            images.extend(pdf_images)
            txt = f"{txt}\nExtracted text from PDF:\n{pdf_text}"
        else:
@@ -73,11 +80,16 @@ def bot_streaming(message, history, max_new_tokens=2048):
    if not images:
        inputs = processor(text=texts, return_tensors="pt").to("cuda")
    else:
+         # Handle multiple images if needed
+         max_images = 4  # Limit number of images to process
+         if len(images) > max_images:
+             images = images[:max_images]
+             txt += f"\n(Note: Only processing first {max_images} pages of the PDF)"
+
        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")

    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-     generated_text = ""

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
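Putting the new pieces together, a CPU-only sketch (not part of the commit) of the PDF path: convert a hypothetical local sample.pdf with the new helper, then apply the same four-image cap before anything reaches the model:

# Assumes process_pdf_file from app.py is importable; "sample.pdf" is a stand-in path.
pdf_images, pdf_text = process_pdf_file("sample.pdf")
txt = "Summarize this document."
txt = f"{txt}\nExtracted text from PDF:\n{pdf_text}"

max_images = 4
if len(pdf_images) > max_images:
    pdf_images = pdf_images[:max_images]
    txt += f"\n(Note: Only processing first {max_images} pages of the PDF)"

print(f"{len(pdf_images)} page image(s), {len(txt)} prompt characters")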
@@ -85,10 +97,10 @@ def bot_streaming(message, history, max_new_tokens=2048):

    for new_text in streamer:
        buffer += new_text
-         generated_text_without_prompt = buffer
        time.sleep(0.01)
        yield buffer

+ # Create the Gradio interface
demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Document Analyzer",
@@ -116,7 +128,8 @@ demo = gr.ChatInterface(
    multimodal=True
)

- # Update file types to include PDFs
+ # Update accepted file types
demo.textbox.file_types = ["image", "pdf"]

+ # Launch the interface
demo.launch(debug=True)
 