winamnd committed on
Commit
07dea49
·
verified ·
1 Parent(s): 09a5cc0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -2
app.py CHANGED
@@ -19,12 +19,12 @@ RESULTS_CSV = "ocr_results.csv"
19
 
20
  # Ensure model exists
21
  if not os.path.exists(os.path.join(MODEL_PATH, "pytorch_model.bin")):
22
- print(f"⚠️ Model not found in {MODEL_PATH}. Downloading from Hugging Face Hub...")
23
  model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
24
  model.save_pretrained(MODEL_PATH)
25
  tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
26
  tokenizer.save_pretrained(MODEL_PATH)
27
- print(f" Model saved at {MODEL_PATH}.")
28
  else:
29
  model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
30
  tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
@@ -47,6 +47,13 @@ def ocr_with_easy(img):
47
  results = reader.readtext(gray_image, detail=0)
48
  return ' '.join(results)
49
 
 
 
 
 
 
 
 
50
  # OCR Function
51
  def generate_ocr(method, img):
52
  if img is None:
@@ -63,6 +70,9 @@ def generate_ocr(method, img):
63
  else: # KerasOCR
64
  text_output = ocr_with_keras(img)
65
 
 
 
 
66
  # Classify Text as Spam or Not Spam
67
  inputs = tokenizer(text_output, return_tensors="pt", truncation=True, padding=True, max_length=512)
68
 
 
19
 
20
  # Ensure model exists
21
  if not os.path.exists(os.path.join(MODEL_PATH, "pytorch_model.bin")):
22
+ print(f" Model not found in {MODEL_PATH}. Downloading from Hugging Face Hub...")
23
  model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
24
  model.save_pretrained(MODEL_PATH)
25
  tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
26
  tokenizer.save_pretrained(MODEL_PATH)
27
+ print(f" Model saved at {MODEL_PATH}.")
28
  else:
29
  model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
30
  tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
 
47
  results = reader.readtext(gray_image, detail=0)
48
  return ' '.join(results)
49
 
50
+ # Preprocess Text
51
def preprocess_text(text: str) -> str:
    """Normalize whitespace in *text*.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space, and leading/trailing whitespace is dropped. No other
    characters are removed or altered.

    Args:
        text: The raw text to clean up.

    Returns:
        The whitespace-normalized text; an empty string when *text* is
        empty or contains only whitespace.
    """
    # str.split() with no separator already discards leading and trailing
    # whitespace, so a separate strip() pass is unnecessary.
    return " ".join(text.split())
56
+
57
  # OCR Function
58
  def generate_ocr(method, img):
59
  if img is None:
 
70
  else: # KerasOCR
71
  text_output = ocr_with_keras(img)
72
 
73
+ # Preprocess the text before feeding to the model
74
+ text_output = preprocess_text(text_output)
75
+
76
  # Classify Text as Spam or Not Spam
77
  inputs = tokenizer(text_output, return_tensors="pt", truncation=True, padding=True, max_length=512)
78