nlpblogs committed on
Commit
eddfa20
·
verified ·
1 Parent(s): 5b1512b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -60
app.py CHANGED
@@ -25,71 +25,116 @@ from sklearn.feature_extraction.text import TfidfVectorizer
25
  from sklearn.metrics.pairwise import cosine_similarity
26
  import tempfile
27
 
28
- import streamlit as st
29
- import pandas as pd
30
- from PyPDF2 import PdfReader
31
- from gliner import GLiNER
32
- from sklearn.feature_extraction.text import TfidfVectorizer
33
- from sklearn.metrics.pairwise import cosine_similarity
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
def process_documents(job_description_key, file_uploader_key, title):
    """Render one analysis section: job-description input, resume upload,
    GLiNER entity extraction, and TF-IDF cosine-similarity ranking.

    Args:
        job_description_key: Streamlit widget key for the job-description text area.
        file_uploader_key: Streamlit widget key for the PDF file uploader.
        title: Label interpolated into this section's headers and prompts.
    """
    txt = st.text_area(f"Job description for {title}", key=job_description_key)
    job_description_series = pd.Series([txt], name="Text")
    st.dataframe(job_description_series)

    uploaded_files = st.file_uploader(
        f"Choose PDF file(s) for candidate profiles for {title}", type="pdf", key=file_uploader_key,
    )

    all_extracted_data = []
    if uploaded_files:
        # Load the NER model once for all files: from_pretrained is expensive.
        model = GLiNER.from_pretrained("urchade/gliner_base")
        labels = ["person", "country", "organization", "time", "role"]
        for uploaded_file in uploaded_files:
            try:
                pdf_reader = PdfReader(uploaded_file)
                text_data = ""
                for page in pdf_reader.pages:
                    # FIX: extract_text() may return None for pages with no
                    # extractable text; guard to avoid a TypeError on +=.
                    text_data += page.extract_text() or ""

                entities = model.predict_entities(text_data, labels)
                # Group extracted entity strings by label.
                entity_dict = {}
                for label in labels:
                    entity_dict[label] = [entity["text"] for entity in entities if entity["label"] == label]
                data = {"Text": text_data, **entity_dict}
                all_extracted_data.append(data)
            except Exception as e:
                st.error(f"Error processing file {uploaded_file.name}: {e}")

    if all_extracted_data:
        df_entities = pd.DataFrame(all_extracted_data)
        st.subheader(f"Extracted Entities ({title}):")
        st.dataframe(df_entities)

        # Document 0 is the job description; the rest are resume texts.
        all_documents = [job_description_series.iloc[0]] + df_entities['Text'].tolist()
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(all_documents)
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
        st.subheader(f"TF-IDF Values ({title}):")
        st.dataframe(tfidf_df)

        cosine_sim_matrix = cosine_similarity(tfidf_matrix)
        cosine_sim_df = pd.DataFrame(cosine_sim_matrix)
        st.subheader(f"Cosine Similarity Matrix ({title}):")
        st.dataframe(cosine_sim_df)

        # Row 0 holds similarities of the job description against every
        # document; skip index 0 (its similarity with itself).
        st.subheader(f"Cosine Similarity Scores (Job Description for {title} vs. Resumes):")
        for i, similarity_score in enumerate(cosine_sim_matrix[0][1:]):
            st.write(f"Similarity with Candidate Profile {i + 1}: {similarity_score:.4f}")
84
-
85
# Render the two independent analysis sections; distinct widget keys keep
# each section's Streamlit state isolated. A divider separates the sets.
for set_number in (1, 2):
    st.header(f"Analysis Set {set_number}")
    process_documents(f"text {set_number}", f"candidate {set_number}", f"Set {set_number}")
    if set_number == 1:
        st.divider()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
 
 
 
 
93
 
 
 
 
 
94
 
95
-
 
25
  from sklearn.metrics.pairwise import cosine_similarity
26
  import tempfile
27
 
28
# ---- Analysis Set 1: job description vs. uploaded candidate resumes ----
txt1 = st.text_area("Job description", key = "text 1")
job_description_series1 = pd.Series(txt1, name="Text")
st.dataframe(job_description_series1)

uploaded_files = st.file_uploader(
    "Choose a PDF file(s) for candidate profiles", type="pdf", key = "candidate 1"
)

all_resumes_text = []  # Store the text content of each PDF

if uploaded_files:
    # Load the NER model once, not once per file: from_pretrained is expensive.
    model = GLiNER.from_pretrained("urchade/gliner_base")
    labels = ["person", "country", "organization", "time", "role"]
    for uploaded_file in uploaded_files:
        try:
            pdf_reader = PdfReader(uploaded_file)
            text_data = ""
            for page in pdf_reader.pages:
                # FIX: extract_text() can return None for image-only pages;
                # guard to avoid a TypeError on string concatenation.
                text_data += page.extract_text() or ""
            entities = model.predict_entities(text_data, labels)

            # NOTE(review): entity_dict is computed but never displayed or
            # stored downstream in this section; kept for parity with the
            # original flow (the prediction itself is the costly step).
            entity_dict = {}
            for label in labels:
                entity_dict[label] = [entity["text"] for entity in entities if entity["label"] == label]

            all_resumes_text.append(text_data)
        except Exception as e:
            st.error(f"Error processing file {uploaded_file.name}: {e}")

if all_resumes_text:
    # FIX: the original referenced the undefined name `job_description_series`
    # (NameError at runtime); this section's series is job_description_series1.
    # Document 0 is the job description; the rest are resume texts.
    all_documents = [job_description_series1.iloc[0]] + all_resumes_text

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_documents)

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    st.subheader("TF-IDF Values:")
    st.dataframe(tfidf_df)

    cosine_sim_matrix = cosine_similarity(tfidf_matrix)
    cosine_sim_df = pd.DataFrame(cosine_sim_matrix)
    st.subheader("Cosine Similarity Matrix:")
    st.dataframe(cosine_sim_df)

    # Display similarity scores between the job description and each resume.
    # Row 0 holds the job description's similarities; skip its self-similarity.
    st.subheader("Cosine Similarity Scores (Job Description vs. Resumes):")
    for i, similarity_score in enumerate(cosine_sim_matrix[0][1:]):
        st.write(f"Similarity with Candidate Profile {i + 1}: {similarity_score:.4f}")
81
+
82
+
83
+
84
st.divider()

# ---- Analysis Set 2: a second, independent job description vs. resumes ----
txt2 = st.text_area("Job description", key = "text 2")
job_description_series2 = pd.Series(txt2, name="Text")
st.dataframe(job_description_series2)

uploaded_files = st.file_uploader(
    "Choose a PDF file(s) for candidate profiles", type="pdf", key = "candidate 2"
)

all_resumes_text = []  # Store the text content of each PDF

if uploaded_files:
    # Load the NER model once, not once per file: from_pretrained is expensive.
    model = GLiNER.from_pretrained("urchade/gliner_base")
    labels = ["person", "country", "organization", "time", "role"]
    for uploaded_file in uploaded_files:
        try:
            pdf_reader = PdfReader(uploaded_file)
            text_data = ""
            for page in pdf_reader.pages:
                # FIX: extract_text() can return None for image-only pages;
                # guard to avoid a TypeError on string concatenation.
                text_data += page.extract_text() or ""
            entities = model.predict_entities(text_data, labels)

            # NOTE(review): entity_dict is computed but never displayed or
            # stored downstream in this section; kept for parity with the
            # original flow (the prediction itself is the costly step).
            entity_dict = {}
            for label in labels:
                entity_dict[label] = [entity["text"] for entity in entities if entity["label"] == label]

            all_resumes_text.append(text_data)
        except Exception as e:
            st.error(f"Error processing file {uploaded_file.name}: {e}")

if all_resumes_text:
    # FIX: the original referenced the undefined name `job_description_series`
    # (NameError at runtime); this section's series is job_description_series2.
    # Document 0 is the job description; the rest are resume texts.
    all_documents = [job_description_series2.iloc[0]] + all_resumes_text

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_documents)

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    st.subheader("TF-IDF Values:")
    st.dataframe(tfidf_df)

    cosine_sim_matrix = cosine_similarity(tfidf_matrix)
    cosine_sim_df = pd.DataFrame(cosine_sim_matrix)
    st.subheader("Cosine Similarity Matrix:")
    st.dataframe(cosine_sim_df)

    # Display similarity scores between the job description and each resume.
    # Row 0 holds the job description's similarities; skip its self-similarity.
    st.subheader("Cosine Similarity Scores (Job Description vs. Resumes):")
    for i, similarity_score in enumerate(cosine_sim_matrix[0][1:]):
        st.write(f"Similarity with Candidate Profile {i + 1}: {similarity_score:.4f}")