billusanda007 commited on
Commit
3635259
·
verified ·
1 Parent(s): 82b81d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -49
app.py CHANGED
@@ -1,61 +1,79 @@
1
- import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import re
5
  import pickle
6
- import nltk
7
- from nltk.corpus import stopwords
 
 
8
  from tensorflow.keras.models import load_model
9
  from tensorflow.keras.preprocessing.text import Tokenizer
10
  from tensorflow.keras.preprocessing.sequence import pad_sequences
11
-
12
- nltk.download('stopwords')
13
- stop_words = set(stopwords.words('english'))
14
 
15
  def cleanResume(resumeText):
16
- resumeText = re.sub(r'http\S+\s*', ' ', resumeText)
17
- resumeText = re.sub(r'RT|cc', ' ', resumeText)
18
- resumeText = re.sub(r'#\S+', '', resumeText)
19
- resumeText = re.sub(r'@\S+', ' ', resumeText)
20
- resumeText = re.sub(r'[%s]' % re.escape("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"), ' ', resumeText)
21
  resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)
22
- resumeText = re.sub(r'\s+', ' ', resumeText)
23
- resumeText = ' '.join([word for word in resumeText.split() if word.lower() not in stop_words])
24
  return resumeText
25
 
26
- def load_resources():
27
- with open('tokenizer.pkl', 'rb') as f:
28
- tokenizer = pickle.load(f)
29
- with open('label_encoder.pkl', 'rb') as f:
30
- label_encoder = pickle.load(f)
31
- model = load_model('deeprank_model_v2.h5')
32
- return tokenizer, label_encoder, model
33
-
34
- def infer(text, tokenizer, label_encoder, model):
35
- cleaned_text = cleanResume(text)
36
- sequence = tokenizer.texts_to_sequences([cleaned_text])
37
- padded_sequence = pad_sequences(sequence, maxlen=500)
38
- prediction = model.predict(padded_sequence)
39
- predicted_class = label_encoder.inverse_transform([np.argmax(prediction)])
40
- return predicted_class[0]
41
-
42
- st.title("Resume Category Predictor")
43
-
44
- st.write("Upload a resume text file or enter text below to predict the job category.")
45
-
46
- uploaded_file = st.file_uploader("Upload Resume (TXT file)", type=["txt"])
47
- user_input = st.text_area("Or paste resume text here:")
48
-
49
- if uploaded_file is not None:
50
- resume_text = uploaded_file.read().decode("utf-8")
51
- st.session_state["resume_text"] = resume_text
52
- elif user_input:
53
- resume_text = user_input
54
- st.session_state["resume_text"] = resume_text
55
- else:
56
- resume_text = ""
57
-
58
- if st.button("Predict Category") and resume_text:
59
- tokenizer, label_encoder, model = load_resources()
60
- prediction = infer(resume_text, tokenizer, label_encoder, model)
61
- st.write(f"Predicted Category: **{prediction}**")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
  import numpy as np
3
  import re
4
  import pickle
5
+ import pdfminer
6
+ from pdfminer.high_level import extract_text
7
+ import pytesseract
8
+ from pdf2image import convert_from_path
9
  from tensorflow.keras.models import load_model
10
  from tensorflow.keras.preprocessing.text import Tokenizer
11
  from tensorflow.keras.preprocessing.sequence import pad_sequences
12
+ from sklearn.preprocessing import LabelEncoder
 
 
13
 
14
def cleanResume(resumeText):
    """Normalize raw resume text before tokenization.

    Strips URLs, the literal tokens 'RT'/'cc', hashtags, mentions,
    ASCII punctuation, and non-ASCII characters, then collapses runs of
    whitespace to single spaces.

    Args:
        resumeText: Raw resume text as a single string.

    Returns:
        The cleaned string. A trailing space may remain (matches the
        preprocessing used at training time, so it is preserved).
    """
    # Raw strings throughout: '\S'/'\s' in plain strings are invalid
    # escape sequences and warn on Python >= 3.12.
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)
    resumeText = re.sub(r'RT|cc', ' ', resumeText)  # NOTE: also hits 'cc' inside words, as in training.
    resumeText = re.sub(r'#\S+', '', resumeText)
    resumeText = re.sub(r'@\S+', ' ', resumeText)
    resumeText = re.sub(r'[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # Drop non-ASCII.
    resumeText = re.sub(r'\s+', ' ', resumeText)  # Collapse whitespace.
    return resumeText
23
 
24
def pdf_to_text(file):
    """Extract text from a PDF, falling back to OCR for scanned documents.

    Args:
        file: Path to the PDF file. (`convert_from_path` requires a
            filesystem path, not a file object -- NOTE(review): confirm
            callers only ever pass paths.)

    Returns:
        The extracted (or OCR'd) text as a single string.
    """
    try:
        text = extract_text(file)
    except Exception:
        # A corrupted/encrypted PDF can make pdfminer raise outright;
        # treat that like "no text layer" so the OCR fallback still runs.
        text = ""
    if not text.strip():  # No embedded text layer: rasterize pages and OCR them.
        images = convert_from_path(file)
        text = "\n".join(pytesseract.image_to_string(img) for img in images)
    return text
30
+
31
def load_deeprank_model(model_path='deeprank_model.h5'):
    """Load the trained DeepRank Keras model.

    Args:
        model_path: Path to the saved .h5 model file. Defaults to the
            bundled 'deeprank_model.h5', so existing callers are unchanged.

    Returns:
        The loaded tf.keras model.
    """
    return load_model(model_path)
33
+
34
def predict_category(resumes_data, selected_category, max_sequence_length, model, tokenizer, label):
    """Score every resume against all categories and rank by one of them.

    Args:
        resumes_data: List of dicts with 'ResumeText' and 'FileName' keys.
        selected_category: Category name (must be one of label.classes_)
            to sort the resumes by.
        max_sequence_length: Padding length for tokenized sequences.
        model: Trained Keras classifier producing per-category probabilities.
        tokenizer: Fitted Keras Tokenizer matching the model's vocabulary.
        label: Fitted LabelEncoder whose classes_ order matches the
            model's output columns.

    Returns:
        List of {'Rank': int, 'FileName': str} dicts, rank 1 being the
        highest probability for the selected category.
    """
    frame = pd.DataFrame(resumes_data)
    sequences = tokenizer.texts_to_sequences(frame['ResumeText'].values)
    padded = pad_sequences(sequences, maxlen=max_sequence_length)

    # One probability column per category, aligned with label.classes_.
    probabilities = model.predict(padded)
    for column_index, category_name in enumerate(label.classes_):
        frame[category_name] = probabilities[:, column_index]

    ordered = frame.sort_values(by=selected_category, ascending=False)
    return [
        {'Rank': position, 'FileName': filename}
        for position, filename in enumerate(ordered['FileName'], start=1)
    ]
48
+
49
def main():
    """CLI entry point: rank user-supplied resume PDFs for a chosen category.

    Rebuilds the tokenizer and label encoder from the training CSV (their
    vocabularies must match what the model was trained with), extracts text
    from each supplied PDF, then prints the resumes ranked by predicted
    probability for the selected category.
    """
    model = load_deeprank_model()

    # Re-derive tokenizer/label vocabularies from the original training data
    # so inference preprocessing matches training exactly.
    df = pd.read_csv('UpdatedResumeDataSet.csv')
    df['cleaned'] = df['Resume'].apply(cleanResume)
    label = LabelEncoder()
    df['Category'] = label.fit_transform(df['Category'])

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df['cleaned'].values)
    max_sequence_length = 500  # Must match the sequence length used at training time.

    # Skip empty entries so a trailing comma doesn't produce a bogus '' path.
    paths = [p.strip() for p in
             input("Enter the paths of resumes (comma-separated): ").split(',')
             if p.strip()]
    resumes_data = [{'ResumeText': cleanResume(pdf_to_text(path)), 'FileName': path}
                    for path in paths]

    print("Available categories:", list(label.classes_))
    # .strip() so a stray trailing space/newline doesn't reject a valid category.
    selected_category = input("Select a category to rank by: ").strip()

    if not resumes_data or selected_category not in label.classes_:
        print("Error: Invalid input. Please provide valid resumes and select a valid category.")
    else:
        ranks = predict_category(resumes_data, selected_category,
                                 max_sequence_length, model, tokenizer, label)
        print(pd.DataFrame(ranks))

if __name__ == '__main__':
    main()