Spanicin committed on
Commit
752b8f4
·
verified ·
1 Parent(s): 340e9ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -25,12 +25,12 @@ CORS(app, resources={r"/*": {"origins": ["http://localhost:*", "https://play.dev
25
  process_status = {}
26
  process_results = {}
27
  app.config['file_path'] = None
 
28
 
29
  data_ready = False # Flag to check if extraction is complete
30
  lock = threading.Lock() # Lock to manage concurrent access
31
  extracted_texts = {}
32
-
33
- os.environ["HF_HOME"] = "/app/cache"
34
  ocr_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
35
  ocr_model = AutoModel.from_pretrained(
36
  'ucaslcl/GOT-OCR2_0', trust_remote_code=True,
@@ -45,8 +45,8 @@ class DynamicTableExtractor:
45
  def __init__(self, pdf_bytes: bytes, output_folder: str):
46
  self.pdf_bytes = pdf_bytes
47
  self.images = convert_from_bytes(pdf_bytes)
48
- self.output_folder = output_folder
49
- os.makedirs(output_folder, exist_ok=True)
50
 
51
  def detect_lines(self, img_array):
52
  gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
@@ -209,6 +209,7 @@ def extract_text_from_image(image_path):
209
  return ocr_model.chat(ocr_tokenizer, image_path, ocr_type='ocr')
210
 
211
  def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_images"):
 
212
  os.makedirs(output_dir, exist_ok=True)
213
  text_only_pages = [page_num for page_num, category in categorized_pages.items() if category == "only text"]
214
  extracted_texts = {}
@@ -220,6 +221,8 @@ def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_im
220
  return extracted_texts
221
 
222
  def extract_text_from_table_pages(pdf_path, categorized_pages, output_folder="extracted_tables"):
 
 
223
  extracted_texts = {}
224
  table_pages = [page_num for page_num, category in categorized_pages.items() if category in ["only table", "text & table"]]
225
  with open(pdf_path, "rb") as f:
@@ -278,7 +281,8 @@ def process_pdf(pdf_path, process_id):
278
  extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
279
  table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
280
  extracted_texts.update(table_texts)
281
- temp_file_path = tempfile.mktemp(suffix='.txt')
 
282
  filepath = save_extracted_text(extracted_texts, temp_file_path) # Save extracted text to file
283
  app.config['file_path'] = filepath
284
  process_status[process_id] = "completed"
@@ -298,7 +302,9 @@ def upload_pdf():
298
  return jsonify({'error': 'No file provided'}), 400
299
 
300
  file = request.files['file']
301
- pdf_path = os.path.join("uploads", file.filename)
 
 
302
  os.makedirs("uploads", exist_ok=True)
303
  file.save(pdf_path)
304
  process_id = str(uuid.uuid4())
 
25
  process_status = {}
26
  process_results = {}
27
  app.config['file_path'] = None
28
+ TEMP_DIR = tempfile.mkdtemp()
29
 
30
  data_ready = False # Flag to check if extraction is complete
31
  lock = threading.Lock() # Lock to manage concurrent access
32
  extracted_texts = {}
33
+ os.environ["HF_HOME"] = os.path.join(TEMP_DIR, "cache") #"/app/cache"
 
34
  ocr_tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
35
  ocr_model = AutoModel.from_pretrained(
36
  'ucaslcl/GOT-OCR2_0', trust_remote_code=True,
 
45
  def __init__(self, pdf_bytes: bytes, output_folder: str):
46
  self.pdf_bytes = pdf_bytes
47
  self.images = convert_from_bytes(pdf_bytes)
48
+ self.output_folder = os.path.join(TEMP_DIR, output_folder)
49
+ os.makedirs(self.output_folder, exist_ok=True)
50
 
51
  def detect_lines(self, img_array):
52
  gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
 
209
  return ocr_model.chat(ocr_tokenizer, image_path, ocr_type='ocr')
210
 
211
  def save_text_pages_as_images(pdf_path, categorized_pages, output_dir="output_images"):
212
+ output_dir = os.path.join(TEMP_DIR, output_dir)
213
  os.makedirs(output_dir, exist_ok=True)
214
  text_only_pages = [page_num for page_num, category in categorized_pages.items() if category == "only text"]
215
  extracted_texts = {}
 
221
  return extracted_texts
222
 
223
  def extract_text_from_table_pages(pdf_path, categorized_pages, output_folder="extracted_tables"):
224
+ output_folder = os.path.join(TEMP_DIR, output_folder)
225
+ os.makedirs(output_folder, exist_ok=True)
226
  extracted_texts = {}
227
  table_pages = [page_num for page_num, category in categorized_pages.items() if category in ["only table", "text & table"]]
228
  with open(pdf_path, "rb") as f:
 
281
  extracted_texts = save_text_pages_as_images(pdf_path, categorized_pages)
282
  table_texts = extract_text_from_table_pages(pdf_path, categorized_pages)
283
  extracted_texts.update(table_texts)
284
+ temp_file_path = os.path.join(TEMP_DIR, f"extracted_{process_id}.txt")
285
+ # temp_file_path = tempfile.mktemp(suffix='.txt')
286
  filepath = save_extracted_text(extracted_texts, temp_file_path) # Save extracted text to file
287
  app.config['file_path'] = filepath
288
  process_status[process_id] = "completed"
 
302
  return jsonify({'error': 'No file provided'}), 400
303
 
304
  file = request.files['file']
305
+ pdf_path = os.path.join(TEMP_DIR, "uploads", file.filename)
306
+ os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
307
+ # pdf_path = os.path.join("uploads", file.filename)
308
  os.makedirs("uploads", exist_ok=True)
309
  file.save(pdf_path)
310
  process_id = str(uuid.uuid4())