Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -25,12 +25,12 @@ CORS(app, resources={r"/*": {"origins": ["http://localhost:*", "https://play.dev
|
|
25 |
process_status = {}
|
26 |
process_results = {}
|
27 |
app.config['file_path'] = None
|
|
|
28 |
|
29 |
data_ready = False # Flag to check if extraction is complete
|
30 |
lock = threading.Lock() # Lock to manage concurrent access
|
31 |
extracted_texts = {}
|
32 |
-
|
33 |
-
os.environ["HF_HOME"] = "/app/cache"
|
34 |
ocr_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
|
35 |
ocr_model = AutoModel.from_pretrained(
|
36 |
'ucaslcl/GOT-OCR2_0', trust_remote_code=True,
|
@@ -45,8 +45,8 @@ class DynamicTableExtractor:
|
|
45 |
def __init__(self, pdf_bytes: bytes, output_folder: str):
|
46 |
self.pdf_bytes = pdf_bytes
|
47 |
self.images = convert_from_bytes(pdf_bytes)
|
48 |
-
self.output_folder = output_folder
|
49 |
-
os.makedirs(output_folder, exist_ok=True)
|
50 |
|
51 |
def detect_lines(self, img_array):
|
52 |
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
@@ -209,6 +209,7 @@ def extract_text_from_image(image_path):
|
|
209 |
return ocr_model.chat(ocr_tokenizer, image_path, ocr_type='ocr')
|
210 |
|
211 |
def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_images"):
|
|
|
212 |
os.makedirs(output_dir, exist_ok=True)
|
213 |
text_only_pages = [page_num for page_num, category in categorized_pages.items() if category == "only text"]
|
214 |
extracted_texts = {}
|
@@ -220,6 +221,8 @@ def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_im
|
|
220 |
return extracted_texts
|
221 |
|
222 |
def extract_text_from_table_pages(pdf_path, categorized_pages, output_folder="extracted_tables"):
|
|
|
|
|
223 |
extracted_texts = {}
|
224 |
table_pages = [page_num for page_num, category in categorized_pages.items() if category in ["only table", "text & table"]]
|
225 |
with open(pdf_path, "rb") as f:
|
@@ -278,7 +281,8 @@ def process_pdf(pdf_path, process_id):
|
|
278 |
extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
|
279 |
table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
|
280 |
extracted_texts.update(table_texts)
|
281 |
-
temp_file_path =
|
|
|
282 |
filepath = save_extracted_text(extracted_texts, temp_file_path) # Save extracted text to file
|
283 |
app.config['file_path'] = filepath
|
284 |
process_status[process_id] = "completed"
|
@@ -298,7 +302,9 @@ def upload_pdf():
|
|
298 |
return jsonify({'error': 'No file provided'}), 400
|
299 |
|
300 |
file = request.files['file']
|
301 |
-
pdf_path = os.path.join("uploads", file.filename)
|
|
|
|
|
302 |
os.makedirs("uploads", exist_ok=True)
|
303 |
file.save(pdf_path)
|
304 |
process_id = str(uuid.uuid4())
|
|
|
25 |
process_status = {}
|
26 |
process_results = {}
|
27 |
app.config['file_path'] = None
|
28 |
+
TEMP_DIR = tempfile.mkdtemp()
|
29 |
|
30 |
data_ready = False # Flag to check if extraction is complete
|
31 |
lock = threading.Lock() # Lock to manage concurrent access
|
32 |
extracted_texts = {}
|
33 |
+
os.environ["HF_HOME"] = os.path.join(TEMP_DIR, "cache") #"/app/cache"
|
|
|
34 |
ocr_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
|
35 |
ocr_model = AutoModel.from_pretrained(
|
36 |
'ucaslcl/GOT-OCR2_0', trust_remote_code=True,
|
|
|
45 |
def __init__(self, pdf_bytes: bytes, output_folder: str):
|
46 |
self.pdf_bytes = pdf_bytes
|
47 |
self.images = convert_from_bytes(pdf_bytes)
|
48 |
+
self.output_folder = os.path.join(TEMP_DIR, output_folder)
|
49 |
+
os.makedirs(self.output_folder, exist_ok=True)
|
50 |
|
51 |
def detect_lines(self, img_array):
|
52 |
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
|
|
209 |
return ocr_model.chat(ocr_tokenizer, image_path, ocr_type='ocr')
|
210 |
|
211 |
def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_images"):
|
212 |
+
output_dir = os.path.join(TEMP_DIR, output_dir)
|
213 |
os.makedirs(output_dir, exist_ok=True)
|
214 |
text_only_pages = [page_num for page_num, category in categorized_pages.items() if category == "only text"]
|
215 |
extracted_texts = {}
|
|
|
221 |
return extracted_texts
|
222 |
|
223 |
def extract_text_from_table_pages(pdf_path, categorized_pages, output_folder="extracted_tables"):
|
224 |
+
output_folder = os.path.join(TEMP_DIR, output_folder)
|
225 |
+
os.makedirs(output_folder, exist_ok=True)
|
226 |
extracted_texts = {}
|
227 |
table_pages = [page_num for page_num, category in categorized_pages.items() if category in ["only table", "text & table"]]
|
228 |
with open(pdf_path, "rb") as f:
|
|
|
281 |
extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
|
282 |
table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
|
283 |
extracted_texts.update(table_texts)
|
284 |
+
temp_file_path = os.path.join(TEMP_DIR, f"extracted_{process_id}.txt")
|
285 |
+
# temp_file_path = tempfile.mktemp(suffix='.txt')
|
286 |
filepath = save_extracted_text(extracted_texts, temp_file_path) # Save extracted text to file
|
287 |
app.config['file_path'] = filepath
|
288 |
process_status[process_id] = "completed"
|
|
|
302 |
return jsonify({'error': 'No file provided'}), 400
|
303 |
|
304 |
file = request.files['file']
|
305 |
+
pdf_path = os.path.join(TEMP_DIR, "uploads", file.filename)
|
306 |
+
os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
|
307 |
+
# pdf_path = os.path.join("uploads", file.filename)
|
308 |
os.makedirs("uploads", exist_ok=True)
|
309 |
file.save(pdf_path)
|
310 |
process_id = str(uuid.uuid4())
|