AlhitawiMohammed22 committed on
Commit
0833a56
·
1 Parent(s): 574dd01

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ import contextlib
5
+
6
+ logging.basicConfig(
7
+ level=logging.INFO,
8
+ format="%(asctime)s - %(levelname)s - %(message)s",
9
+ )
10
+
11
+
12
+ import gradio as gr
13
+ import nltk
14
+ import torch
15
+
16
+ from pdf2text import *
17
+
18
+ _here = Path(__file__).parent
19
+
20
+ nltk.download("stopwords") # TODO=find where this requirement originates from
21
+
22
+
23
def load_uploaded_file(file_obj, temp_dir: Path = None):
    """
    load_uploaded_file - copy an uploaded file into a working directory

    Args:
        file_obj (POTENTIALLY list): Gradio file object (anything exposing the
            path of the upload as ``.name``), possibly wrapped in a list
        temp_dir (Path, optional): directory the upload is copied into; it is
            created when missing. Defaults to a "temp" folder next to this
            script.

    Returns:
        str, absolute path of the saved copy, or None when there is nothing to
        load or the copy fails
    """

    # the file object may arrive wrapped in a (possibly empty) list
    if isinstance(file_obj, list):
        file_obj = file_obj[0] if file_obj else None
    if file_obj is None:
        logging.error("No uploaded file to load")
        return None
    file_path = Path(file_obj.name)

    # fall back to the default working directory beside this script
    target_dir = _here / "temp" if temp_dir is None else Path(temp_dir)

    try:
        # create the directory here so a missing folder is not a failure
        target_dir.mkdir(parents=True, exist_ok=True)
        # read fully before writing: source and destination may be the same
        pdf_bytes_obj = file_path.read_bytes()
        temp_path = target_dir / file_path.name
        temp_path.write_bytes(pdf_bytes_obj)
        logging.info(f"The uploaded file saved to {temp_path}")
        return str(temp_path.resolve())

    except Exception as e:
        # best-effort helper: report the problem and signal failure with None
        logging.error(f"Trying to load file with path {file_path}, error: {e}")
        print(f"Trying to load file with path {file_path}, error: {e}")
        return None
54
+
55
+
56
def convert_PDF(
    pdf_obj,
    language: str = "en",
    max_pages=20,
):
    """
    convert_PDF - convert a PDF file to text

    Args:
        pdf_obj (POTENTIALLY list): uploaded file object exposing the path of
            the upload as ``.name``, possibly wrapped in a list
        language (str, optional): Language to use for OCR. Defaults to "en".
            Not forwarded to the conversion call at the moment.
        max_pages (int, optional): page limit handed to the converter.
            Defaults to 20.

    Returns:
        tuple (text, html, output_file): the converted text (or a short error
        message), an HTML status/error snippet, and the name of the text file
        that was written (None when nothing was converted)
    """
    # imported locally so the block brings its own dependency
    from html import escape

    global ocr_model
    # clear local text cache
    rm_local_text_files()
    st = time.perf_counter()

    # the file object may arrive wrapped in a (possibly empty) list
    if isinstance(pdf_obj, list):
        pdf_obj = pdf_obj[0] if pdf_obj else None
    if pdf_obj is None:
        logging.error("No file was provided for conversion")
        html_error = """
        <div style="color: red; font-size: 20px; font-weight: bold;">
        No file was uploaded. Please upload a PDF file.
        </div>
        """
        return "No file was uploaded", html_error, None

    file_path = Path(pdf_obj.name)
    # compare case-insensitively so "scan.PDF" is accepted as well
    if file_path.suffix.lower() != ".pdf":
        logging.error(f"File {file_path} is not a PDF file")

        # the file name comes from the user: escape it before embedding in HTML
        html_error = f"""
        <div style="color: red; font-size: 20px; font-weight: bold;">
        File {escape(str(file_path))} is not a PDF file. Please upload a PDF file.
        </div>
        """
        return "File is not a PDF file", html_error, None

    conversion_stats = convert_PDF_to_Text(
        file_path,
        ocr_model=ocr_model,
        max_pages=max_pages,
    )
    converted_txt = conversion_stats["converted_text"]
    num_pages = conversion_stats["num_pages"]
    was_truncated = conversion_stats["truncated"]
    # if alt_lang: # TODO: fix this

    # elapsed wall-clock time, reported in minutes
    rt = round((time.perf_counter() - st) / 60, 2)
    print(f"Runtime: {rt} minutes")
    status_html = ""
    if was_truncated:
        status_html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
    status_html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"

    # written relative to the current working directory
    _output_name = f"RESULT_{file_path.stem}_OCR.txt"
    with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
        f.write(converted_txt)

    return converted_txt, status_html, _output_name
108
+
109
+
110
if __name__ == "__main__":
    logging.info("Starting app")

    # CUDA availability is only reported in the log here
    use_GPU = torch.cuda.is_available()
    logging.info(f"Using GPU status: {use_GPU}")

    logging.info("Loading OCR model")
    # suppress whatever the predictor factory prints to stdout while loading
    with contextlib.redirect_stdout(None):
        ocr_model = ocr_predictor(
            "db_resnet50",
            "crnn_mobilenet_v3_large",
            pretrained=True,
            assume_straight_pages=True,
        )

    # absolute path (as a string) of "exampler.pdf" beside this script
    pdf_obj = str((_here / "exampler.pdf").resolve())

    # make sure the working directory beside this script exists
    _temp_dir = _here / "temp"
    _temp_dir.mkdir(exist_ok=True)

    logging.info("starting demo")
    with gr.Blocks() as demo:
        gr.Markdown("# PDF to Text")
        gr.Markdown(
            "A basic demo for end-to-end text detection and recognition where the input will be in pdf format and the result is text conversion using OCR from the [doctr](https://mindee.github.io/doctr/index.html) package"
        )
        gr.Markdown("---")