rmdhirr committed on
Commit
a04dd4c
·
verified ·
1 Parent(s): 65ec533

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -2
app.py CHANGED
@@ -34,7 +34,7 @@ def normalize_length(text, target_length=50):
34
  if len(text) < target_length:
35
  text = text + " " * (target_length - len(text))
36
  else:
37
- text = text[: target_length]
38
  return text
39
 
40
  def preprocess_url(url):
@@ -62,6 +62,13 @@ def preprocess_html(html):
62
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
63
  return ' '.join(tokens)
64
 
 
 
 
 
 
 
 
65
  max_url_length = 180
66
  max_html_length = 2000
67
  max_words = 10000
@@ -84,7 +91,10 @@ def get_prediction(input_text):
84
  url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
85
  html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
86
 
87
- input_data = [url_data, html_data]
 
 
 
88
  prediction = model.predict(input_data)[0][0]
89
  return prediction
90
 
 
34
  if len(text) < target_length:
35
  text = text + " " * (target_length - len(text))
36
  else:
37
+ text = text[:target_length]
38
  return text
39
 
40
  def preprocess_url(url):
 
62
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
63
  return ' '.join(tokens)
64
 
65
+ def extract_features(url):
66
+ features = {}
67
+ features['length'] = len(url)
68
+ features['num_special_chars'] = len(re.findall(r'[^a-zA-Z0-9]', url))
69
+ features['num_digits'] = len(re.findall(r'\d', url))
70
+ return features
71
+
72
  max_url_length = 180
73
  max_html_length = 2000
74
  max_words = 10000
 
91
  url_data = preprocess_input(cleaned_url, url_tokenizer, max_url_length)
92
  html_data = preprocess_input(cleaned_html, html_tokenizer, max_html_length)
93
 
94
+ features = extract_features(input_text)
95
+ features_vector = np.array([[features['length'], features['num_special_chars'], features['num_digits']]])
96
+
97
+ input_data = [url_data, html_data, features_vector]
98
  prediction = model.predict(input_data)[0][0]
99
  return prediction
100