nlpblogs committed on
Commit 5b1512b · verified · 1 Parent(s): 0e0978c

Update app.py

Files changed (1)
  1. app.py +59 -108
app.py CHANGED
@@ -25,119 +25,70 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import tempfile
 
-# First set of inputs and calculations
-txt1 = st.text_area("Job description 1", key="text 1")
-job_description_series1 = pd.Series([txt1], name="Text")
-st.dataframe(job_description_series1)
-
-uploaded_files1 = st.file_uploader(
-    "Choose PDF file(s) for candidate profiles 1", type="pdf", key="candidate 1", accept_multiple_files=True
-)
-
-all_resumes_text1 = []  # Store the text content of each PDF
-
-if uploaded_files1:
-    for uploaded_file in uploaded_files1:
-        try:
-            pdf_reader = PdfReader(uploaded_file)
-            text_data = ""
-            for page in pdf_reader.pages:
-                text_data += page.extract_text()
-            model = GLiNER.from_pretrained("urchade/gliner_base")
-            labels = ["person", "country", "organization", "time", "role"]
-            entities = model.predict_entities(text_data, labels)
-
-            entity_dict = {}
-            for label in labels:
-                entity_dict[label] = [entity["text"] for entity in entities if entity["label"] == label]
-
-            data = {"Text": text_data, **entity_dict}
-
-            all_resumes_text1.append(data)
-        except Exception as e:
-            st.error(f"Error processing file {uploaded_file.name}: {e}")
-
-if all_resumes_text1:
-    all_documents1 = [job_description_series1.iloc[0]] + all_resumes_text1
-
-    vectorizer1 = TfidfVectorizer()
-    tfidf_matrix1 = vectorizer1.fit_transform(all_documents1)
-
-    tfidf_df1 = pd.DataFrame(tfidf_matrix1.toarray(), columns=vectorizer1.get_feature_names_out())
-    st.subheader("TF-IDF Values (Set 1):")
-    st.dataframe(tfidf_df1)
-
-    cosine_sim_matrix1 = cosine_similarity(tfidf_matrix1)
-    cosine_sim_df1 = pd.DataFrame(cosine_sim_matrix1)
-    st.subheader("Cosine Similarity Matrix (Set 1):")
-    st.dataframe(cosine_sim_df1)
-
-    st.subheader("Cosine Similarity Scores (Job Description 1 vs. Resumes 1):")
-    for i, similarity_score in enumerate(cosine_sim_matrix1[0][1:]):
-        st.write(f"Similarity with Candidate Profile {i + 1}: {similarity_score:.4f}")
-
-st.divider()
-
-# Second set of inputs and calculations
-txt2 = st.text_area("Job description 2", key="text 2")
-job_description_series2 = pd.Series([txt2], name="Text")
-st.dataframe(job_description_series2)
-
-uploaded_files2 = st.file_uploader(
-    "Choose PDF file(s) for candidate profiles 2", type="pdf", key="candidate 2", accept_multiple_files=True
-)
-
-all_resumes_text2 = []  # Store the text content of each PDF
-
-if uploaded_files2:
-    for uploaded_file in uploaded_files2:
-        try:
-            pdf_reader = PdfReader(uploaded_file)
-            text_data = ""
-            for page in pdf_reader.pages:
-                text_data += page.extract_text()
-            model = GLiNER.from_pretrained("urchade/gliner_base")
-            labels = ["person", "country", "organization", "time", "role"]
-            entities = model.predict_entities(text_data, labels)
-
-            entity_dict = {}
-            for label in labels:
-                entity_dict[label] = [entity["text"] for entity in entities if entity["label"] == label]
-
-            data = {"Text": text_data, **entity_dict}
-
-            all_resumes_text2.append(text_data)
-        except Exception as e:
-            st.error(f"Error processing file {uploaded_file.name}: {e}")
-
-if all_resumes_text2:
-    all_documents2 = [job_description_series2.iloc[0]] + all_resumes_text2
-
-    vectorizer2 = TfidfVectorizer()
-    tfidf_matrix2 = vectorizer2.fit_transform(all_documents2)
-
-    tfidf_df2 = pd.DataFrame(tfidf_matrix2.toarray(), columns=vectorizer2.get_feature_names_out())
-    st.subheader("TF-IDF Values (Set 2):")
-    st.dataframe(tfidf_df2)
-
-    cosine_sim_matrix2 = cosine_similarity(tfidf_matrix2)
-    cosine_sim_df2 = pd.DataFrame(cosine_sim_matrix2)
-    st.subheader("Cosine Similarity Matrix (Set 2):")
-    st.dataframe(cosine_sim_df2)
-
-    st.subheader("Cosine Similarity Scores (Job Description 2 vs. Resumes 2):")
-    for i, similarity_score in enumerate(cosine_sim_matrix2[0][1:]):
-        st.write(f"Similarity with Candidate Profile {i + 1}: {similarity_score:.4f}")
+import streamlit as st
+import pandas as pd
+from PyPDF2 import PdfReader
+from gliner import GLiNER
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def process_documents(job_description_key, file_uploader_key, title):
+    txt = st.text_area(f"Job description for {title}", key=job_description_key)
+    job_description_series = pd.Series([txt], name="Text")
+    st.dataframe(job_description_series)
+
+    uploaded_files = st.file_uploader(
+        f"Choose PDF file(s) for candidate profiles for {title}", type="pdf", key=file_uploader_key, accept_multiple_files=True
+    )
+
+    all_extracted_data = []
+    if uploaded_files:
+        model = GLiNER.from_pretrained("urchade/gliner_base")
+        labels = ["person", "country", "organization", "time", "role"]
+        for uploaded_file in uploaded_files:
+            try:
+                pdf_reader = PdfReader(uploaded_file)
+                text_data = ""
+                for page in pdf_reader.pages:
+                    text_data += page.extract_text()
+                entities = model.predict_entities(text_data, labels)
+                entity_dict = {}
+                for label in labels:
+                    entity_dict[label] = [entity["text"] for entity in entities if entity["label"] == label]
+                data = {"Text": text_data, **entity_dict}
+                all_extracted_data.append(data)
+            except Exception as e:
+                st.error(f"Error processing file {uploaded_file.name}: {e}")
+
+    if all_extracted_data:
+        df_entities = pd.DataFrame(all_extracted_data)
+        st.subheader(f"Extracted Entities ({title}):")
+        st.dataframe(df_entities)
+
+        all_documents = [job_description_series.iloc[0]] + df_entities['Text'].tolist()
+        vectorizer = TfidfVectorizer()
+        tfidf_matrix = vectorizer.fit_transform(all_documents)
+        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
+        st.subheader(f"TF-IDF Values ({title}):")
+        st.dataframe(tfidf_df)
+
+        cosine_sim_matrix = cosine_similarity(tfidf_matrix)
+        cosine_sim_df = pd.DataFrame(cosine_sim_matrix)
+        st.subheader(f"Cosine Similarity Matrix ({title}):")
+        st.dataframe(cosine_sim_df)
+
+        st.subheader(f"Cosine Similarity Scores (Job Description for {title} vs. Resumes):")
+        for i, similarity_score in enumerate(cosine_sim_matrix[0][1:]):
+            st.write(f"Similarity with Candidate Profile {i + 1}: {similarity_score:.4f}")
+
+st.header("Analysis Set 1")
+process_documents("text 1", "candidate 1", "Set 1")
+
+st.divider()
+
+st.header("Analysis Set 2")
+process_documents("text 2", "candidate 2", "Set 2")
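
Note: the ranking step that this commit folds into process_documents is plain TF-IDF plus cosine similarity from scikit-learn. A minimal standalone sketch of that step, with made-up sample strings standing in for the text-area input and the parsed PDF texts:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Made-up stand-ins for the job description and the extracted resume texts.
job_description = "Senior data engineer with Python and SQL experience"
resume_texts = [
    "Data engineer skilled in Python, SQL and Airflow",
    "Graphic designer focused on branding and illustration",
]

# Fit one vocabulary over the job description plus all resumes, as app.py does.
documents = [job_description] + resume_texts
tfidf_matrix = TfidfVectorizer().fit_transform(documents)

# Row 0 is the job description; compare it against every resume row.
for i, score in enumerate(cosine_similarity(tfidf_matrix)[0][1:]):
    print(f"Similarity with Candidate Profile {i + 1}: {score:.4f}")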