Spaces:

winamnd
/

ocr-llm-test

Running

App Files Community

winamnd committed on Feb 16

Commit

fdc3a82

verified ·

1 Parent(s): 70ac79e

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -80

app.py CHANGED Viewed

@@ -1,18 +1,7 @@
-import gradio as gr
 import cv2
 import easyocr
-import pandas as pd
-import nltk
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-from nltk.stem import PorterStemmer
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.ensemble import RandomForestClassifier
-# Download necessary NLTK data
-nltk.data.path.append("/usr/local/lib/nltk_data")
-nltk.download('punkt')
-nltk.download('stopwords')
 """
 EasyOCR for Text Extraction
@@ -30,71 +19,10 @@ def ocr_with_easy(img):
     return extracted_text
 """
-Text Preprocessing for Spam Classification
-"""
-def preprocess_text(text):
-    tokens = word_tokenize(text.lower())
-    stop_words = set(stopwords.words('english'))
-    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
-    stemmer = PorterStemmer()
-    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
-    processed_text = ' '.join(stemmed_tokens)
-    print("Processed Text:", processed_text)  # Debugging line
-    return processed_text
-"""
-Load and Train Spam Classifier
-"""
-# Load the dataset
-data = pd.read_csv('spam.csv', encoding='latin-1')
-data['v2'] = data['v2'].apply(preprocess_text)
-# Feature Extraction (TF-IDF)
-tfidf_vectorizer = TfidfVectorizer()
-tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])
-# Label Encoding
-data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})
-# Create a Random Forest classifier
-rf_classifier = RandomForestClassifier(random_state=42)
-rf_classifier.fit(tfidf_matrix, data['v1'])
-"""
-OCR and Spam Classification Pipeline
-"""
-def ocr_and_classify_spam(img):
-    # Step 1: Extract text from the image using EasyOCR
-    extracted_text = ocr_with_easy(img)
-    # Step 2: Preprocess and classify the extracted text
-    if extracted_text:
-        processed_text = preprocess_text(extracted_text)
-        if processed_text:  # Check if text is not empty after preprocessing
-            input_tfidf = tfidf_vectorizer.transform([processed_text])
-            prediction = rf_classifier.predict(input_tfidf)
-            spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM"
-        else:
-            spam_result = "No valid text to classify."
-    else:
-        spam_result = "No text found in the image."
-    return extracted_text, spam_result
-"""
-Create User Interface with Gradio
 """
-image = gr.Image()
-output_text = gr.Textbox(label="Extracted Text")
-output_classification = gr.Textbox(label="Spam Classification")
-demo = gr.Interface(
-    fn=ocr_and_classify_spam,
-    inputs=image,
-    outputs=[output_text, output_classification],
-    title="OCR and Spam Classifier",
-    description="Upload an image with text. The text will be extracted using EasyOCR and then classified as SPAM or NOT SPAM.",
-    css=".gradio-container {background-color: lightgray}"
-)
-demo.launch()

 import cv2
 import easyocr
+from PIL import Image
+import numpy as np
 """
 EasyOCR for Text Extraction
     return extracted_text
 """
+Example Usage
 """
+if __name__ == "__main__":
+    # Example to test the OCR function
+    input_image = np.array(Image.open('example_image.png'))
+    text_output = ocr_with_easy(input_image)
+    print("Final Output:", text_output)