winamnd committed on
Commit
fdc3a82
·
verified ·
1 Parent(s): 70ac79e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -80
app.py CHANGED
@@ -1,18 +1,7 @@
1
- import gradio as gr
2
  import cv2
3
  import easyocr
4
- import pandas as pd
5
- import nltk
6
- from nltk.tokenize import word_tokenize
7
- from nltk.corpus import stopwords
8
- from nltk.stem import PorterStemmer
9
- from sklearn.feature_extraction.text import TfidfVectorizer
10
- from sklearn.ensemble import RandomForestClassifier
11
-
12
- # Download necessary NLTK data
13
- nltk.data.path.append("/usr/local/lib/nltk_data")
14
- nltk.download('punkt')
15
- nltk.download('stopwords')
16
 
17
  """
18
  EasyOCR for Text Extraction
@@ -30,71 +19,10 @@ def ocr_with_easy(img):
30
  return extracted_text
31
 
32
  """
33
- Text Preprocessing for Spam Classification
34
- """
35
- def preprocess_text(text):
36
- tokens = word_tokenize(text.lower())
37
- stop_words = set(stopwords.words('english'))
38
- filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
39
- stemmer = PorterStemmer()
40
- stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
41
- processed_text = ' '.join(stemmed_tokens)
42
- print("Processed Text:", processed_text) # Debugging line
43
- return processed_text
44
-
45
- """
46
- Load and Train Spam Classifier
47
- """
48
- # Load the dataset
49
- data = pd.read_csv('spam.csv', encoding='latin-1')
50
- data['v2'] = data['v2'].apply(preprocess_text)
51
-
52
- # Feature Extraction (TF-IDF)
53
- tfidf_vectorizer = TfidfVectorizer()
54
- tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])
55
-
56
- # Label Encoding
57
- data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})
58
-
59
- # Create a Random Forest classifier
60
- rf_classifier = RandomForestClassifier(random_state=42)
61
- rf_classifier.fit(tfidf_matrix, data['v1'])
62
-
63
- """
64
- OCR and Spam Classification Pipeline
65
- """
66
- def ocr_and_classify_spam(img):
67
- # Step 1: Extract text from the image using EasyOCR
68
- extracted_text = ocr_with_easy(img)
69
-
70
- # Step 2: Preprocess and classify the extracted text
71
- if extracted_text:
72
- processed_text = preprocess_text(extracted_text)
73
- if processed_text: # Check if text is not empty after preprocessing
74
- input_tfidf = tfidf_vectorizer.transform([processed_text])
75
- prediction = rf_classifier.predict(input_tfidf)
76
- spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM"
77
- else:
78
- spam_result = "No valid text to classify."
79
- else:
80
- spam_result = "No text found in the image."
81
-
82
- return extracted_text, spam_result
83
-
84
- """
85
- Create User Interface with Gradio
86
  """
87
- image = gr.Image()
88
- output_text = gr.Textbox(label="Extracted Text")
89
- output_classification = gr.Textbox(label="Spam Classification")
90
-
91
- demo = gr.Interface(
92
- fn=ocr_and_classify_spam,
93
- inputs=image,
94
- outputs=[output_text, output_classification],
95
- title="OCR and Spam Classifier",
96
- description="Upload an image with text. The text will be extracted using EasyOCR and then classified as SPAM or NOT SPAM.",
97
- css=".gradio-container {background-color: lightgray}"
98
- )
99
-
100
- demo.launch()
 
 
1
  import cv2
2
  import easyocr
3
+ from PIL import Image
4
+ import numpy as np
 
 
 
 
 
 
 
 
 
 
5
 
6
  """
7
  EasyOCR for Text Extraction
 
19
  return extracted_text
20
 
21
  """
22
+ Example Usage
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  """
24
+ if __name__ == "__main__":
25
+ # Example to test the OCR function
26
+ input_image = np.array(Image.open('example_image.png'))
27
+ text_output = ocr_with_easy(input_image)
28
+ print("Final Output:", text_output)