Spaces:

winamnd
/

ocr-llm-test

Running

winamnd committed on Feb 16

Commit

07dea49

verified ·

1 Parent(s): 09a5cc0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,12 +19,12 @@ RESULTS_CSV = "ocr_results.csv"
 # Ensure model exists
 if not os.path.exists(os.path.join(MODEL_PATH, "pytorch_model.bin")):
-    print(f"⚠️ Model not found in {MODEL_PATH}. Downloading from Hugging Face Hub...")
     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
     model.save_pretrained(MODEL_PATH)
     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
     tokenizer.save_pretrained(MODEL_PATH)
-    print(f"✅ Model saved at {MODEL_PATH}.")
 else:
     model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
     tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
@@ -47,6 +47,13 @@ def ocr_with_easy(img):
     results = reader.readtext(gray_image, detail=0)
     return ' '.join(results)
 # OCR Function
 def generate_ocr(method, img):
     if img is None:
@@ -63,6 +70,9 @@ def generate_ocr(method, img):
     else:  # KerasOCR
         text_output = ocr_with_keras(img)
     # Classify Text as Spam or Not Spam
     inputs = tokenizer(text_output, return_tensors="pt", truncation=True, padding=True, max_length=512)

 # Ensure model exists
 if not os.path.exists(os.path.join(MODEL_PATH, "pytorch_model.bin")):
+    print(f" Model not found in {MODEL_PATH}. Downloading from Hugging Face Hub...")
     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
     model.save_pretrained(MODEL_PATH)
     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
     tokenizer.save_pretrained(MODEL_PATH)
+    print(f" Model saved at {MODEL_PATH}.")
 else:
     model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
     tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
     results = reader.readtext(gray_image, detail=0)
     return ' '.join(results)
+# Preprocess Text
+def preprocess_text(text):
+    # Normalize whitespace only; no non-whitespace characters are removed
+    text = text.strip()  # Remove leading/trailing whitespace
+    text = ' '.join(text.split())  # Collapse each run of whitespace into a single space
+    return text
 # OCR Function
 def generate_ocr(method, img):
     if img is None:
     else:  # KerasOCR
         text_output = ocr_with_keras(img)
+    # Preprocess the text before feeding to the model
+    text_output = preprocess_text(text_output)
     # Classify Text as Spam or Not Spam
     inputs = tokenizer(text_output, return_tensors="pt", truncation=True, padding=True, max_length=512)