UniquePratham committed · verified
Commit 8b34af2 · 1 Parent(s): 5217ecf

Update ocr_cpu.py

Refactor ocr_cpu.py

Files changed (1)
  1. ocr_cpu.py +17 -13
ocr_cpu.py CHANGED
@@ -1,7 +1,6 @@
 import os
-from transformers import AutoModel, AutoTokenizer
 import torch
-import re
+from transformers import AutoModel, AutoTokenizer
 
 # Load model and tokenizer
 model_name = "srimanth-d/GOT_CPU"  # Using GOT model on CPU
@@ -57,17 +56,22 @@ def extract_text_got(uploaded_file):
     os.remove(temp_file_path)
     print(f"Temporary file {temp_file_path} removed.")
 
-# Function to clean extracted text (removes extra spaces and handles special cases for Hindi and English)
-def clean_text(extracted_text):
+# Function to clean extracted text using AI
+def clean_text_with_ai(extracted_text):
     """
-    Cleans extracted text by removing extra spaces and handling language-specific issues (Hindi, English, Hinglish).
+    Cleans extracted text by leveraging an AI model to intelligently remove extra spaces.
     """
-    # Normalize spaces (remove multiple spaces)
-    text = re.sub(r'\s+', ' ', extracted_text)
+    try:
+        # Prepare the input for the AI model
+        inputs = tokenizer(extracted_text, return_tensors="pt").to(device)
+
+        # Generate cleaned text using the AI model
+        with torch.no_grad():
+            outputs = model.generate(**inputs, max_new_tokens=100)  # Adjust max_new_tokens as needed
 
-    # Handle special cases based on Hindi, English, and Hinglish patterns
-    text = re.sub(r'([a-zA-Z]+)\s+([a-zA-Z]+)', r'\1 \2', text)  # For English
-    text = re.sub(r'([ा-ह]+)\s+([ा-ह]+)', r'\1\2', text)  # For Hindi (conjoining Devanagari characters)
-
-    # Remove trailing and leading spaces
-    return text.strip()
+        # Decode the generated output
+        cleaned_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        return cleaned_text.strip()  # Return the cleaned text
+    except Exception as e:
+        return f"Error during AI text cleaning: {str(e)}"