Spaces:

winamnd
/

ocr-llm-test

Running

App Files Community

winamnd committed on Feb 16

Commit

14299e0

verified ·

1 Parent(s): 07dea49

Update app.py

Browse files

Files changed (1) hide show

app.py +3 -13

app.py CHANGED Viewed

@@ -19,12 +19,12 @@ RESULTS_CSV = "ocr_results.csv"
 # Ensure model exists
 if not os.path.exists(os.path.join(MODEL_PATH, "pytorch_model.bin")):
-    print(f" Model not found in {MODEL_PATH}. Downloading from Hugging Face Hub...")
     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
     model.save_pretrained(MODEL_PATH)
     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
     tokenizer.save_pretrained(MODEL_PATH)
-    print(f" Model saved at {MODEL_PATH}.")
 else:
     model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
     tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
@@ -47,13 +47,6 @@ def ocr_with_easy(img):
     results = reader.readtext(gray_image, detail=0)
     return ' '.join(results)
-# Preprocess Text
-def preprocess_text(text):
-    # Clean up the text by removing unwanted characters
-    text = text.strip()  # Remove leading/trailing whitespace
-    text = ' '.join(text.split())  # Normalize spaces
-    return text
 # OCR Function
 def generate_ocr(method, img):
     if img is None:
@@ -70,9 +63,6 @@ def generate_ocr(method, img):
     else:  # KerasOCR
         text_output = ocr_with_keras(img)
-    # Preprocess the text before feeding to the model
-    text_output = preprocess_text(text_output)
     # Classify Text as Spam or Not Spam
     inputs = tokenizer(text_output, return_tensors="pt", truncation=True, padding=True, max_length=512)
@@ -106,4 +96,4 @@ demo = gr.Interface(
 # Launch App
 if __name__ == "__main__":
-    demo.launch()

 # Ensure model exists
 if not os.path.exists(os.path.join(MODEL_PATH, "pytorch_model.bin")):
+    print(f"⚠️ Model not found in {MODEL_PATH}. Downloading from Hugging Face Hub...")
     model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
     model.save_pretrained(MODEL_PATH)
     tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
     tokenizer.save_pretrained(MODEL_PATH)
+    print(f"✅ Model saved at {MODEL_PATH}.")
 else:
     model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
     tokenizer = DistilBertTokenizer.from_pretrained(MODEL_PATH)
     results = reader.readtext(gray_image, detail=0)
     return ' '.join(results)
 # OCR Function
 def generate_ocr(method, img):
     if img is None:
     else:  # KerasOCR
         text_output = ocr_with_keras(img)
     # Classify Text as Spam or Not Spam
     inputs = tokenizer(text_output, return_tensors="pt", truncation=True, padding=True, max_length=512)
 # Launch App
 if __name__ == "__main__":
+    demo.launch()