Spaces:

winamnd
/

ocr-llm-test

Running

winamnd committed on Feb 16

Commit

70ac79e

verified ·

1 Parent(s): ed06b10

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -26,6 +26,7 @@ def ocr_with_easy(img):
     reader = easyocr.Reader(['en'])
     bounds = reader.readtext('image.png', paragraph="False", detail=0)
     extracted_text = ' '.join(bounds)
     return extracted_text
 """
@@ -37,7 +38,9 @@ def preprocess_text(text):
     filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
     stemmer = PorterStemmer()
     stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
-    return ' '.join(stemmed_tokens)
 """
 Load and Train Spam Classifier
@@ -67,9 +70,12 @@ def ocr_and_classify_spam(img):
     # Step 2: Preprocess and classify the extracted text
     if extracted_text:
         processed_text = preprocess_text(extracted_text)
-        input_tfidf = tfidf_vectorizer.transform([processed_text])
-        prediction = rf_classifier.predict(input_tfidf)
-        spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM"
     else:
         spam_result = "No text found in the image."

     reader = easyocr.Reader(['en'])
     bounds = reader.readtext('image.png', paragraph="False", detail=0)
     extracted_text = ' '.join(bounds)
+    print("Extracted Text:", extracted_text)  # Debugging line
     return extracted_text
 """
     filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
     stemmer = PorterStemmer()
     stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    processed_text = ' '.join(stemmed_tokens)
+    print("Processed Text:", processed_text)  # Debugging line
+    return processed_text
 """
 Load and Train Spam Classifier
     # Step 2: Preprocess and classify the extracted text
     if extracted_text:
         processed_text = preprocess_text(extracted_text)
+        if processed_text:  # Check if text is not empty after preprocessing
+            input_tfidf = tfidf_vectorizer.transform([processed_text])
+            prediction = rf_classifier.predict(input_tfidf)
+            spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM"
+        else:
+            spam_result = "No valid text to classify."
     else:
         spam_result = "No text found in the image."