Update ocr_cpu.py
ocr_cpu.py  CHANGED  (+17 -13)
@@ -1,7 +1,6 @@
 import os
-from transformers import AutoModel, AutoTokenizer
 import torch
-import
+from transformers import AutoModel, AutoTokenizer
 
 # Load model and tokenizer
 model_name = "srimanth-d/GOT_CPU"  # Using GOT model on CPU
@@ -57,17 +56,22 @@ def extract_text_got(uploaded_file):
         os.remove(temp_file_path)
         print(f"Temporary file {temp_file_path} removed.")
 
-# Function to clean extracted text
-def …
-    """
-    Cleans extracted text by …
-    """
-    …
+# Function to clean extracted text using AI
+def clean_text_with_ai(extracted_text):
+    """
+    Cleans extracted text by leveraging an AI model to intelligently remove extra spaces.
+    """
+    try:
+        # Prepare the input for the AI model
+        inputs = tokenizer(extracted_text, return_tensors="pt").to(device)
+
+        # Generate cleaned text using the AI model
+        with torch.no_grad():
+            outputs = model.generate(**inputs, max_new_tokens=100)  # Adjust max_new_tokens as needed
+
+        # Decode the generated output
+        cleaned_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        return cleaned_text.strip()  # Return the cleaned text
+    except Exception as e:
+        return f"Error during AI text cleaning: {str(e)}"
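For context, a minimal usage sketch of the new helper (not part of the commit): it assumes ocr_cpu.py keeps model, tokenizer, and device as module-level globals (the .to(device) call in clean_text_with_ai relies on that) and that extract_text_got returns the raw OCR string. The wrapper name ocr_and_clean is hypothetical.

# Hypothetical wrapper around the two helpers in ocr_cpu.py; assumes the module
# defines `model`, `tokenizer`, and `device` at import time, as the diff implies.
from ocr_cpu import extract_text_got, clean_text_with_ai

def ocr_and_clean(uploaded_file):
    raw_text = extract_text_got(uploaded_file)  # run GOT OCR on the uploaded file
    cleaned = clean_text_with_ai(raw_text)      # post-process with the same model/tokenizer
    return cleaned

Note that max_new_tokens=100 caps the cleaned output at 100 generated tokens, so longer OCR results would be truncated unless that limit is raised, as the inline comment in the commit already suggests.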