Andrei Shadrikov committed on
Commit
d15e89e
·
1 Parent(s): d798dd1
Files changed (1) hide show
  1. app.py +23 -3
app.py CHANGED
@@ -6,6 +6,7 @@ from pdf2image import convert_from_path
6
  import shutil
7
  import tempfile
8
  from transformers import pipeline
 
9
 
10
  out_files = gr.State([])
11
  FILE_TIMEOUT = 10 ** 3
@@ -31,6 +32,18 @@ def handle_files(cur_files):
31
  cur_files = cur_files[-MAX_FILES:]
32
  return cur_files
33
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Function to process PDF and generate ZIP file
35
  def process_pdf(pdf_file, cur_files):
36
 
@@ -60,7 +73,14 @@ def process_image_and_text(doc, text_query):
60
  outputs += p(img, question)
61
  return sorted(outputs, key=lambda x: x["score"], reverse=True)[0]['answer']
62
 
63
- # Interface for the Gradio app
 
 
 
 
 
 
 
64
  pdf_interface = gr.Interface(
65
  fn=process_pdf,
66
  inputs=[PDF(label="Upload PDF"), out_files],
@@ -82,9 +102,9 @@ image_interface = gr.Interface(
82
 
83
  # Create a tabbed interface
84
  tabbed_interface = gr.TabbedInterface(
85
- [pdf_interface, image_interface],
86
  title="PDF interaction",
87
- tab_names=["Converter", "Interaction"],
88
  # description="Choose a tab to perform the desired task."
89
  )
90
 
 
6
  import shutil
7
  import tempfile
8
  from transformers import pipeline
9
+ import subprocess as sp
10
 
11
  out_files = gr.State([])
12
  FILE_TIMEOUT = 10 ** 3
 
32
  cur_files = cur_files[-MAX_FILES:]
33
  return cur_files
34
 
35
+
36
def extract_text(pdf_file):
    """
    Render the text of a PDF file with the ``pdftotext`` command-line tool.

    Args:
        pdf_file: Path of the PDF file to read. A falsy value (``None`` or
            an empty string, e.g. when nothing was uploaded) yields an
            empty result instead of invoking the tool.

    Returns:
        The text that ``pdftotext -layout`` writes to standard output, as a
        single string (not a list of lines).

    Raises:
        FileNotFoundError: If the ``pdftotext`` executable cannot be found.
        subprocess.CalledProcessError: If ``pdftotext`` exits with a
            non-zero status (``check=True``).
    """
    # Nothing to convert: avoid handing None/"" to the external tool.
    if not pdf_file:
        return ""
    # Argument list (no shell). The trailing '-' is passed as the output
    # file argument so the rendered text arrives on stdout.
    args = ['pdftotext', '-layout', pdf_file, '-']
    cp = sp.run(
        args, stdout=sp.PIPE, stderr=sp.DEVNULL,
        check=True, text=True
    )
    return cp.stdout
46
+
47
  # Function to process PDF and generate ZIP file
48
  def process_pdf(pdf_file, cur_files):
49
 
 
73
  outputs += p(img, question)
74
  return sorted(outputs, key=lambda x: x["score"], reverse=True)[0]['answer']
75
 
76
# Text-extraction tab. extract_text takes a single PDF path and returns a
# single string, so the interface is wired with exactly one input component
# and one output component (no shared file-list state is passed through).
text_interface = gr.Interface(
    fn=extract_text,
    inputs=PDF(label="Upload PDF"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="PDF extractor",
    description="Extracts text from the PDF container."
)
83
+
84
  pdf_interface = gr.Interface(
85
  fn=process_pdf,
86
  inputs=[PDF(label="Upload PDF"), out_files],
 
102
 
103
  # Create a tabbed interface
104
  tabbed_interface = gr.TabbedInterface(
105
+ [text_interface, pdf_interface, image_interface],
106
  title="PDF interaction",
107
+ tab_names=["Text extractor", "Converter", "Interaction"],
108
  # description="Choose a tab to perform the desired task."
109
  )
110