UniquePratham committed · verified
Commit 8b34af2 · 1 Parent(s): 5217ecf

Update ocr_cpu.py

Refactor ocr_cpu.py

Files changed (1)
  1. ocr_cpu.py +17 -13
ocr_cpu.py CHANGED
@@ -1,7 +1,6 @@
 import os
-from transformers import AutoModel, AutoTokenizer
 import torch
-import re
+from transformers import AutoModel, AutoTokenizer
 
 # Load model and tokenizer
 model_name = "srimanth-d/GOT_CPU"  # Using GOT model on CPU
@@ -57,17 +56,22 @@ def extract_text_got(uploaded_file):
     os.remove(temp_file_path)
     print(f"Temporary file {temp_file_path} removed.")
 
-# Function to clean extracted text (removes extra spaces and handles special cases for Hindi and English)
-def clean_text(extracted_text):
+# Function to clean extracted text using AI
+def clean_text_with_ai(extracted_text):
     """
-    Cleans extracted text by removing extra spaces and handling language-specific issues (Hindi, English, Hinglish).
+    Cleans extracted text by leveraging an AI model to intelligently remove extra spaces.
     """
-    # Normalize spaces (remove multiple spaces)
-    text = re.sub(r'\s+', ' ', extracted_text)
+    try:
+        # Prepare the input for the AI model
+        inputs = tokenizer(extracted_text, return_tensors="pt").to(device)
+
+        # Generate cleaned text using the AI model
+        with torch.no_grad():
+            outputs = model.generate(**inputs, max_new_tokens=100)  # Adjust max_new_tokens as needed
 
-    # Handle special cases based on Hindi, English, and Hinglish patterns
-    text = re.sub(r'([a-zA-Z]+)\s+([a-zA-Z]+)', r'\1 \2', text)  # For English
-    text = re.sub(r'([ा-ह]+)\s+([ा-ह]+)', r'\1\2', text)  # For Hindi (conjoining Devanagari characters)
-
-    # Remove trailing and leading spaces
-    return text.strip()
+        # Decode the generated output
+        cleaned_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        return cleaned_text.strip()  # Return the cleaned text
+    except Exception as e:
+        return f"Error during AI text cleaning: {str(e)}"