gagan3012 committed on
Commit
20757be
·
1 Parent(s): b524f07

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -30
app.py CHANGED
@@ -9,7 +9,21 @@ from io import BytesIO
9
  import openai
10
  import requests
11
  from nougat.dataset.rasterize import rasterize_paper
 
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def predict_arabic(img, model_name="UBC-NLP/Qalam"):
15
  # if img is None:
@@ -61,22 +75,25 @@ def predict_english(img, model_name="naver-clova-ix/donut-base-finetuned-cord-v2
61
  sequence = re.sub(r"<.*?>", "", sequence).strip()
62
  return sequence
63
 
 
64
  def predict_nougat(img, model_name="facebook/nougat-small"):
65
- device="cpu"
66
  processor = NougatProcessor.from_pretrained(model_name)
67
  model = VisionEncoderDecoderModel.from_pretrained(model_name)
68
  image = img.convert("RGB")
69
- pixel_values = processor(image, return_tensors="pt", data_format="channels_first").pixel_values
70
-
 
71
  # generate transcription (here we only generate 30 tokens)
72
  outputs = model.generate(
73
- pixel_values.to(device),
74
- min_length=1,
75
- max_new_tokens=1500,
76
- bad_words_ids=[[processor.tokenizer.unk_token_id]],
77
  )
78
-
79
- page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
 
80
  # page_sequence = processor.post_process_generation(page_sequence, fix_markdown=False)
81
  return page_sequence
82
 
@@ -91,20 +108,23 @@ def inference_nougat(pdf_file, pdf_link):
91
  else:
92
  file_name = pdf_file.name
93
  pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
94
-
95
  images = rasterize_paper(file_name, return_pil=True)
96
  sequence = ""
97
  # infer for every page and concat
98
  for image in images:
99
  sequence += predict_nougat(image)
100
-
101
- content = sequence.replace(r'\(', '$').replace(r'\)', '$').replace(r'\[', '$$').replace(r'\]', '$$')
 
102
  return content
103
 
 
104
  def predict_tesseract(img):
105
  text = pytesseract.image_to_string(Image.open(img))
106
  return text
107
 
 
108
  st.set_option('deprecation.showfileUploaderEncoding', False)
109
 
110
  st.set_page_config(
@@ -123,7 +143,8 @@ st.set_page_config(
123
  st.header("Qalam: A Multilingual OCR System")
124
  st.sidebar.header("Configuration and Image Upload")
125
  st.sidebar.subheader("Adjust Image Enhancement Options")
126
- img_file = st.sidebar.file_uploader(label='Upload a file', type=['png', 'jpg', "pdf"])
 
127
  input_file = st.sidebar.text_input("Enter the file URL")
128
  realtime_update = st.sidebar.checkbox(label="Update in Real Time", value=True)
129
  # box_color = st.sidebar.color_picker(label="Box Color", value='#0000FF')
@@ -145,12 +166,11 @@ Models = {
145
  "Korean": "Donut",
146
  "Chinese": "Donut"
147
  }
148
-
149
- st.sidebar.markdown(f"### Selected Model: {Models[Lng]}")
150
 
 
151
 
152
 
153
- if img_file:
154
  cropped_img = Image.open(img_file)
155
  if not realtime_update:
156
  st.write("Double click to save crop")
@@ -160,19 +180,19 @@ if img_file:
160
  # st.subheader("Input: Upload and Crop Your Image")
161
  # # Get a cropped image from the frontend
162
  # cropped_img = st_cropper(
163
- # img,
164
- # realtime_update=realtime_update,
165
  # box_color="#FF0000",
166
  # aspect_ratio=aspect_ratio,
167
  # should_resize_image=True,
168
  # )
169
-
170
  # with col2:
171
  # # Manipulate cropped image at will
172
  # st.subheader("Output: Preview and Analyze")
173
  # # _ = cropped_img.thumbnail((150, 150))
174
  # st.image(cropped_img)
175
-
176
  button = st.sidebar.button("Run OCR")
177
 
178
  if button:
@@ -191,9 +211,10 @@ if img_file:
191
  st.subheader(f"OCR Results for {Lng}")
192
  st.write(ocr_text)
193
  text_file = BytesIO(ocr_text.encode())
194
- st.download_button('Download Text', text_file, file_name='ocr_text.txt')
 
195
 
196
- elif input_file is not "":
197
  button = st.sidebar.button("Run OCR")
198
 
199
  if button:
@@ -202,26 +223,26 @@ elif input_file is not "":
202
  st.subheader(f"OCR Results for the PDF file")
203
  st.write(ocr_text)
204
  text_file = BytesIO(ocr_text.encode())
205
- st.download_button('Download Text', text_file, file_name='ocr_text.txt')
206
-
207
 
208
  # openai.api_key = ""
209
-
210
  # if "openai_model" not in st.session_state:
211
  # st.session_state["openai_model"] = "gpt-3.5-turbo"
212
-
213
  # if "messages" not in st.session_state:
214
  # st.session_state.messages = []
215
-
216
  # for message in st.session_state.messages:
217
  # with st.chat_message(message["role"]):
218
  # st.markdown(message["content"])
219
-
220
  # if prompt := st.chat_input("How can I help?"):
221
  # st.session_state.messages.append({"role": "user", "content": ocr_text + prompt})
222
  # with st.chat_message("user"):
223
  # st.markdown(prompt)
224
-
225
  # with st.chat_message("assistant"):
226
  # message_placeholder = st.empty()
227
  # full_response = ""
@@ -237,4 +258,3 @@ elif input_file is not "":
237
  # message_placeholder.markdown(full_response + "▌")
238
  # message_placeholder.markdown(full_response)
239
  # st.session_state.messages.append({"role": "assistant", "content": full_response})
240
-
 
9
  import openai
10
  import requests
11
  from nougat.dataset.rasterize import rasterize_paper
12
+ import uuid
13
+ import os
14
 
15
def get_pdf(pdf_link):
    """Download the PDF at ``pdf_link`` into the current working directory.

    The file is stored under a UUID-based name so that repeated downloads
    never overwrite one another.

    Args:
        pdf_link: URL of the PDF to fetch.

    Returns:
        Path of the downloaded file (rooted at ``os.getcwd()``).

    Raises:
        requests.RequestException: if the request itself fails or times out.
        RuntimeError: if the server answers with anything other than HTTP 200.
    """
    unique_filename = f"{os.getcwd()}/downloaded_paper_{uuid.uuid4().hex}.pdf"

    # Bound the wait so an unresponsive server cannot hang the app forever.
    response = requests.get(pdf_link, timeout=60)

    if response.status_code != 200:
        print("Failed to download the PDF.")
        # Fail here rather than hand back a path to a file that was never
        # written, which would only surface later as an unrelated error.
        raise RuntimeError(
            f"Failed to download the PDF (HTTP {response.status_code})."
        )

    with open(unique_filename, 'wb') as pdf_file:
        pdf_file.write(response.content)
    print("PDF downloaded successfully.")
    return unique_filename
27
 
28
  def predict_arabic(img, model_name="UBC-NLP/Qalam"):
29
  # if img is None:
 
75
  sequence = re.sub(r"<.*?>", "", sequence).strip()
76
  return sequence
77
 
78
+
79
  def predict_nougat(img, model_name="facebook/nougat-small"):
80
+ device = "cuda" if torch.cuda.is_available() else "cpu"
81
  processor = NougatProcessor.from_pretrained(model_name)
82
  model = VisionEncoderDecoderModel.from_pretrained(model_name)
83
  image = img.convert("RGB")
84
+ pixel_values = processor(image, return_tensors="pt",
85
+ data_format="channels_first").pixel_values
86
+
87
  # generate transcription (here we only generate 30 tokens)
88
  outputs = model.generate(
89
+ pixel_values.to(device),
90
+ min_length=1,
91
+ max_new_tokens=1500,
92
+ bad_words_ids=[[processor.tokenizer.unk_token_id]],
93
  )
94
+
95
+ page_sequence = processor.batch_decode(
96
+ outputs, skip_special_tokens=True)[0]
97
  # page_sequence = processor.post_process_generation(page_sequence, fix_markdown=False)
98
  return page_sequence
99
 
 
108
  else:
109
  file_name = pdf_file.name
110
  pdf_name = pdf_file.name.split('/')[-1].split('.')[0]
111
+
112
  images = rasterize_paper(file_name, return_pil=True)
113
  sequence = ""
114
  # infer for every page and concat
115
  for image in images:
116
  sequence += predict_nougat(image)
117
+
118
+ content = sequence.replace(r'\(', '$').replace(
119
+ r'\)', '$').replace(r'\[', '$$').replace(r'\]', '$$')
120
  return content
121
 
122
+
123
  def predict_tesseract(img):
124
  text = pytesseract.image_to_string(Image.open(img))
125
  return text
126
 
127
+
128
  st.set_option('deprecation.showfileUploaderEncoding', False)
129
 
130
  st.set_page_config(
 
143
  st.header("Qalam: A Multilingual OCR System")
144
  st.sidebar.header("Configuration and Image Upload")
145
  st.sidebar.subheader("Adjust Image Enhancement Options")
146
+ img_file = st.sidebar.file_uploader(
147
+ label='Upload a file', type=['png', 'jpg', "pdf"])
148
  input_file = st.sidebar.text_input("Enter the file URL")
149
  realtime_update = st.sidebar.checkbox(label="Update in Real Time", value=True)
150
  # box_color = st.sidebar.color_picker(label="Box Color", value='#0000FF')
 
166
  "Korean": "Donut",
167
  "Chinese": "Donut"
168
  }
 
 
169
 
170
+ st.sidebar.markdown(f"### Selected Model: {Models[Lng]}")
171
 
172
 
173
# Image branch: taken only when a file was uploaded and it is not a PDF.
# The uploader yields None until something is uploaded, and the uploaded
# object is not a str, so the extension must be read from its `.name`.
if img_file is not None and not img_file.name.lower().endswith(".pdf"):
    cropped_img = Image.open(img_file)
    if not realtime_update:
        st.write("Double click to save crop")
 
180
  # st.subheader("Input: Upload and Crop Your Image")
181
  # # Get a cropped image from the frontend
182
  # cropped_img = st_cropper(
183
+ # img,
184
+ # realtime_update=realtime_update,
185
  # box_color="#FF0000",
186
  # aspect_ratio=aspect_ratio,
187
  # should_resize_image=True,
188
  # )
189
+
190
  # with col2:
191
  # # Manipulate cropped image at will
192
  # st.subheader("Output: Preview and Analyze")
193
  # # _ = cropped_img.thumbnail((150, 150))
194
  # st.image(cropped_img)
195
+
196
  button = st.sidebar.button("Run OCR")
197
 
198
  if button:
 
211
  st.subheader(f"OCR Results for {Lng}")
212
  st.write(ocr_text)
213
  text_file = BytesIO(ocr_text.encode())
214
+ st.download_button('Download Text', text_file,
215
+ file_name='ocr_text.txt')
216
 
217
+ elif input_file is not "" or img_file.endswith(".pdf"):
218
  button = st.sidebar.button("Run OCR")
219
 
220
  if button:
 
223
  st.subheader(f"OCR Results for the PDF file")
224
  st.write(ocr_text)
225
  text_file = BytesIO(ocr_text.encode())
226
+ st.download_button('Download Text', text_file,
227
+ file_name='ocr_text.txt')
228
 
229
  # openai.api_key = ""
230
+
231
  # if "openai_model" not in st.session_state:
232
  # st.session_state["openai_model"] = "gpt-3.5-turbo"
233
+
234
  # if "messages" not in st.session_state:
235
  # st.session_state.messages = []
236
+
237
  # for message in st.session_state.messages:
238
  # with st.chat_message(message["role"]):
239
  # st.markdown(message["content"])
240
+
241
  # if prompt := st.chat_input("How can I help?"):
242
  # st.session_state.messages.append({"role": "user", "content": ocr_text + prompt})
243
  # with st.chat_message("user"):
244
  # st.markdown(prompt)
245
+
246
  # with st.chat_message("assistant"):
247
  # message_placeholder = st.empty()
248
  # full_response = ""
 
258
  # message_placeholder.markdown(full_response + "▌")
259
  # message_placeholder.markdown(full_response)
260
  # st.session_state.messages.append({"role": "assistant", "content": full_response})