rohitashva committed on
Commit
763ca58
·
verified ·
1 Parent(s): 9107c5f

Delete doc2vec.py

Browse files
Files changed (1) hide show
  1. doc2vec.py +0 -277
doc2vec.py DELETED
@@ -1,277 +0,0 @@
1
- # Importing necessary libraries
2
- from collections import Counter
3
- import streamlit as st
4
- import nltk
5
- from gensim.models.doc2vec import Doc2Vec, TaggedDocument
6
- from nltk.tokenize import word_tokenize
7
- import PyPDF2
8
- import pandas as pd
9
- import re
10
- import matplotlib.pyplot as plt
11
- import seaborn as sns
12
-
13
- # Downloading the 'punkt' tokenizer from NLTK
14
# Fetch the tokenizer data that word_tokenize relies on. Newer NLTK releases
# load the 'punkt_tab' resource instead of the pickled 'punkt' model, so both
# are requested; an identifier unknown to the installed NLTK is reported as a
# failed download rather than raised. quiet=True keeps the download log out
# of every Streamlit rerun.
for _tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(_tokenizer_resource, quiet=True)
15
-
16
- # Function to extract text from a PDF file
17
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; the script passes
            Streamlit uploaded-file objects.

    Returns:
        A single string made of each page's extracted text. A page whose
        extraction yields nothing contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` keeps a page without extractable text from breaking the
    # concatenation; join builds the result once instead of once per page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
23
-
24
- # Function to extract skills from a text using a list of skill keywords
25
def extract_skills(text, skills_keywords):
    """Return the skill keywords that appear in ``text`` as whole terms.

    Matching is case-insensitive and a keyword only counts when it is not
    embedded in a longer word ("java" does not match "javascript").

    Args:
        text: Text to search, e.g. the extracted content of a resume.
        skills_keywords: Iterable of skill names to look for.

    Returns:
        The matching keywords, lower-cased, in the order they were given.
    """
    # Lower-case the haystack once rather than once per keyword.
    lowered_text = text.lower()
    found = []
    for skill in skills_keywords:
        needle = skill.lower()
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it can never match after keywords such as "c++" or "c#".
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        if re.search(pattern, lowered_text):
            found.append(needle)
    return found
29
-
30
- # Function to preprocess text by tokenizing and converting to lowercase
31
def preprocess_text(text):
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``."""
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return tokens
33
-
34
- # Function to extract mobile numbers from a text
35
def extract_mobile_numbers(text):
    """Find phone-number-like digit runs in ``text``.

    A hit is either ten consecutive digits or a 3-3-4 digit grouping whose
    groups may be separated by a dash, a dot or whitespace.

    Returns:
        A list of the matched substrings, in order of appearance.
    """
    matcher = re.compile(r'\b\d{10}\b|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b')
    return matcher.findall(text)
38
-
39
- # Function to extract emails from a text
40
def extract_emails(text):
    """Return every e-mail address found in ``text``.

    Args:
        text: Text to scan.

    Returns:
        A list of the matched addresses, in order of appearance.
    """
    # The top-level domain is letters only. Inside a character class '|' is
    # a literal pipe rather than alternation, so it must not be listed there.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(email_pattern, text)
43
-
44
- # Function to train a Doc2Vec model on a list of tagged documents
45
def train_doc2vec_model(documents):
    """Build and train a Doc2Vec model on ``documents``.

    The model is created with 20-dimensional vectors, ``min_count=2`` and
    50 epochs; its vocabulary is built from ``documents`` before training.

    Args:
        documents: The tagged documents to train on.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    doc2vec = Doc2Vec(vector_size=20, min_count=2, epochs=50)
    doc2vec.build_vocab(documents)
    doc2vec.train(
        documents,
        total_examples=doc2vec.corpus_count,
        epochs=doc2vec.epochs,
    )
    return doc2vec
51
-
52
- # Function to calculate the cosine similarity between two texts using a trained Doc2Vec model
53
def calculate_similarity(model, text1, text2):
    """Return the cosine similarity of two texts under a Doc2Vec model.

    Each text is tokenised with ``preprocess_text`` and turned into a vector
    with ``model.infer_vector``; ``text1`` is inferred before ``text2``.

    Args:
        model: Trained Doc2Vec model.
        text1: First text.
        text2: Second text.

    Returns:
        The first (only) entry of ``model.dv.cosine_similarities`` for the
        two inferred vectors.
    """
    first_vector, second_vector = (
        model.infer_vector(preprocess_text(raw)) for raw in (text1, text2)
    )
    scores = model.dv.cosine_similarities(first_vector, [second_vector])
    return scores[0]
57
-
58
- # Function to calculate accuracy based on true positives, false positives, and false negatives
59
def accuracy_calculation(true_positives, false_positives, false_negatives):
    """Return true positives divided by the sum of all three counts.

    Args:
        true_positives: Number of correct extractions.
        false_positives: Number of extractions not in the ground truth.
        false_negatives: Number of ground-truth items that were missed.

    Returns:
        ``true_positives / (true_positives + false_positives +
        false_negatives)``, or 0 when all three counts sum to zero.
    """
    denominator = true_positives + false_positives + false_negatives
    if denominator == 0:
        # Nothing to score: avoid dividing by zero.
        return 0
    return true_positives / denominator
63
-
64
- # Function to extract CGPA from a text
65
def extract_cgpa(resume_text):
    """Extract the first CGPA/GPA figure mentioned in a resume.

    Two layouts are recognised, case-insensitively:

    * label then number: "CGPA: 8.5", "CGPA:8.5", "C.G.P.A - 9.1",
      "Cumulative GPA 3.7"
    * number then label: "3.9 GPA"

    Args:
        resume_text: Text of the resume.

    Returns:
        The matched number as a float, or None when neither layout occurs.
    """
    cgpa_pattern = (
        # Label first. Dots in the abbreviation are optional literal dots,
        # and no space is required between the separator and the number.
        r'\b(?:Cumulative\s+GPA|C\.?G\.?P\.?A\.?|G\.?P\.?A\.?)'
        r'\s*:?[\s-]*([0-9]+(?:\.[0-9]+)?)\b'
        # Number first.
        r'|\b([0-9]+(?:\.[0-9]+)?)\s*(?:CGPA|GPA)\b'
    )

    match = re.search(cgpa_pattern, resume_text, re.IGNORECASE)
    if match is None:
        return None

    # Exactly one of the two capture groups is filled, depending on which
    # layout matched.
    label_first, number_first = match.group(1), match.group(2)
    return float(label_first if label_first is not None else number_first)
81
-
82
- # Regular expressions for email and phone number patterns
83
# The top-level-domain class is letters only: inside a character class '|'
# is a literal pipe, not alternation, so it must not be listed there.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# Ten consecutive digits, or a 3-3-4 grouping separated by '-', '.' or space.
phone_pattern = r'\b\d{10}\b|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
85
-
86
- # Streamlit Frontend
87
def _format_detail(label, value, number_format="{}"):
    """Render one "label: value" line for the best-match summary.

    Missing values (None/NaN) and blank strings are shown as "Not Mentioned";
    a resume without e-mails or phone numbers produces an empty joined string,
    which is not null but still carries no information.
    """
    is_blank_text = isinstance(value, str) and not value.strip()
    if pd.isnull(value) or is_blank_text:
        return f"{label}: Not Mentioned"
    return f"{label}: {number_format.format(value)}"


st.markdown("# Resume Matching Tool 📃📃")
st.markdown("An application to match resumes with a job description.")

# Sidebar - File Upload for Resumes
st.sidebar.markdown("## Upload Resumes PDF")
resumes_files = st.sidebar.file_uploader(
    "Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)

if resumes_files:
    # Sidebar - File Upload for Job Descriptions
    st.sidebar.markdown("## Upload Job Description PDF")
    job_descriptions_file = st.sidebar.file_uploader(
        "Upload Job Description PDF", type=["pdf"])

    if job_descriptions_file:
        # Sidebar - Sorting Options
        sort_options = ['Weighted Score', 'Similarity Score']
        selected_sort_option = st.sidebar.selectbox(
            "Sort results by", sort_options)

        # Backend Processing
        job_description_text = extract_text_from_pdf(job_descriptions_file)
        resumes_texts = [extract_text_from_pdf(resume_file)
                         for resume_file in resumes_files]

        # The Doc2Vec model is trained on the uploaded resumes only; each
        # resume is tagged with its position in the upload list.
        tagged_resumes = [
            TaggedDocument(words=preprocess_text(text), tags=[str(i)])
            for i, text in enumerate(resumes_texts)]
        model_resumes = train_doc2vec_model(tagged_resumes)

        results_data = {'Resume': [], 'Similarity Score': [],
                        'Weighted Score': [], 'Email': [], 'Contact': [],
                        'CGPA': []}

        # NOTE(review): the earlier per-resume "ground truth" accuracy
        # bookkeeping was dropped here. It compared every resume against
        # hard-coded placeholder contacts and its results were never shown.
        for resume_file, resume_text in zip(resumes_files, resumes_texts):
            similarity_score = calculate_similarity(
                model_resumes, resume_text, job_description_text)

            # Placeholder for criteria other than text similarity; while it
            # stays 0 the weighted score is simply 0.6 * similarity.
            other_criteria_score = 0
            weighted_score = (0.6 * similarity_score) + \
                (0.4 * other_criteria_score)

            results_data['Resume'].append(resume_file.name)
            # Similarity is stored as a percentage; the weighted score keeps
            # the raw scale.
            results_data['Similarity Score'].append(similarity_score * 100)
            results_data['Weighted Score'].append(weighted_score)
            results_data['Email'].append(
                ', '.join(re.findall(email_pattern, resume_text)))
            results_data['Contact'].append(
                ', '.join(re.findall(phone_pattern, resume_text)))
            results_data['CGPA'].append(extract_cgpa(resume_text))

        results_df = pd.DataFrame(results_data)

        sort_column = ('Similarity Score'
                       if selected_sort_option == 'Similarity Score'
                       else 'Weighted Score')
        results_df = results_df.sort_values(by=sort_column, ascending=False)

        st.subheader(f"Results Table (Sorted by {selected_sort_option}):")

        def highlight_max(data, color='grey'):
            """Return a background style for the cells equal to the column maximum."""
            is_max = data == data.max()
            return [f'background-color: {color}' if val else ''
                    for val in is_max]

        # Apply the custom highlighting function to the DataFrame
        st.dataframe(results_df.style.apply(highlight_max, subset=[
            'Similarity Score', 'Weighted Score', 'CGPA']))

        # sort_values keeps the original row labels, so idxmax still
        # identifies the right row after sorting; read every detail from
        # that row instead of indexing back into the upload list.
        highest_score_index = results_df['Similarity Score'].idxmax()
        best_row = results_df.loc[highest_score_index]

        st.subheader("\nDetails of Highest Similarity Score Resume:")
        st.write(f"Resume Name: {best_row['Resume']}")
        st.write(f"Similarity Score: {best_row['Similarity Score']:.2f}")
        st.write(_format_detail(
            "Weighted Score", best_row['Weighted Score'], "{:.2f}"))
        st.write(_format_detail("Email", best_row['Email']))
        st.write(_format_detail("Contact", best_row['Contact']))
        st.write(_format_detail("CGPA", best_row['CGPA']))

        st.subheader("\nHeatmap:")

        # Get skills keywords from user input
        skills_keywords_input = st.text_input(
            "Enter skills keywords separated by commas (e.g., python, java, machine learning):")
        skills_keywords = [skill.strip()
                           for skill in skills_keywords_input.split(',')
                           if skill.strip()]

        if skills_keywords:
            # One row per resume, one column per skill keyword.
            skills_similarity_scores = [
                [calculate_similarity(model_resumes, resume_text, skill)
                 for skill in skills_keywords]
                for resume_text in resumes_texts]

            skills_similarity_df = pd.DataFrame(
                skills_similarity_scores,
                columns=skills_keywords,
                index=[resume_file.name for resume_file in resumes_files])

            # Plot the heatmap
            fig, ax = plt.subplots(figsize=(12, 8))
            sns.heatmap(skills_similarity_df,
                        cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
            ax.set_title('Heatmap for Skills Similarity')
            ax.set_xlabel('Skills')
            ax.set_ylabel('Resumes')

            # Keep the resume names horizontal; set on the Axes itself
            # rather than through pyplot's "current figure" state.
            ax.tick_params(axis='y', labelrotation=0)

            st.pyplot(fig)
            # Release the figure so reruns do not accumulate open figures.
            plt.close(fig)
        else:
            st.write("Please enter at least one skill keyword.")

    else:
        st.warning("Please upload the Job Description PDF to proceed.")
else:
    st.warning("Please upload Resumes PDF to proceed.")