# Source: Hugging Face Space by nlpblogs — app.py (commit 5b1512b, 3.48 kB).
# NOTE(review): the original lines here ("nlpblogs's picture", "Update app.py",
# "raw / history blame") were HTML scrape artifacts from the hosting page, not
# Python code; they are preserved as comments so the file parses.
import tempfile

import pandas as pd
import streamlit as st
from gliner import GLiNER
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
@st.cache_resource
def _load_gliner_model():
    """Load and cache the GLiNER NER model (downloaded once, reused across reruns)."""
    return GLiNER.from_pretrained("urchade/gliner_base")


def process_documents(job_description_key, file_uploader_key, title):
    """Render one analysis section of the Streamlit app.

    Collects a job description and candidate PDF resumes, extracts named
    entities with GLiNER, then reports TF-IDF cosine similarity between the
    job description and each resume.

    Args:
        job_description_key: Unique Streamlit key for the job-description text area.
        file_uploader_key: Unique Streamlit key for the PDF file uploader.
        title: Human-readable label used in this section's headers.
    """
    txt = st.text_area(f"Job description for {title}", key=job_description_key)
    job_description_series = pd.Series([txt], name="Text")
    st.dataframe(job_description_series)
    # accept_multiple_files=True makes the widget return a list of files.
    # Without it a single UploadedFile comes back, and the for-loop below
    # would iterate its raw byte lines instead of the uploaded files.
    uploaded_files = st.file_uploader(
        f"Choose PDF file(s) for candidate profiles for {title}",
        type="pdf",
        key=file_uploader_key,
        accept_multiple_files=True,
    )
    all_extracted_data = []
    if uploaded_files:
        model = _load_gliner_model()
        labels = ["person", "country", "organization", "time", "role"]
        for uploaded_file in uploaded_files:
            try:
                pdf_reader = PdfReader(uploaded_file)
                # extract_text() can return None (e.g. image-only pages);
                # coalesce to "" so the join never raises TypeError.
                text_data = "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )
                entities = model.predict_entities(text_data, labels)
                entity_dict = {
                    label: [ent["text"] for ent in entities if ent["label"] == label]
                    for label in labels
                }
                all_extracted_data.append({"Text": text_data, **entity_dict})
            except Exception as e:
                # Best-effort per file: report the failure and keep going.
                st.error(f"Error processing file {uploaded_file.name}: {e}")
    if all_extracted_data:
        df_entities = pd.DataFrame(all_extracted_data)
        st.subheader(f"Extracted Entities ({title}):")
        st.dataframe(df_entities)
        # Row 0 of the TF-IDF matrix is the job description; rows 1..N are resumes.
        all_documents = [job_description_series.iloc[0]] + df_entities["Text"].tolist()
        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform(all_documents)
        except ValueError as e:
            # Raised when every document is empty/stop-words only ("empty vocabulary");
            # surface it in the UI instead of crashing the whole app.
            st.error(f"Could not compute TF-IDF: {e}")
            return
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()
        )
        st.subheader(f"TF-IDF Values ({title}):")
        st.dataframe(tfidf_df)
        cosine_sim_matrix = cosine_similarity(tfidf_matrix)
        cosine_sim_df = pd.DataFrame(cosine_sim_matrix)
        st.subheader(f"Cosine Similarity Matrix ({title}):")
        st.dataframe(cosine_sim_df)
        st.subheader(f"Cosine Similarity Scores (Job Description for {title} vs. Resumes):")
        # cosine_sim_matrix[0][1:] = similarity of the job description (row 0)
        # against each resume (columns 1..N).
        for i, similarity_score in enumerate(cosine_sim_matrix[0][1:]):
            st.write(f"Similarity with Candidate Profile {i + 1}: {similarity_score:.4f}")
# Drive both analysis sections from one loop; a divider separates them.
for _set_number in (1, 2):
    if _set_number > 1:
        st.divider()
    st.header(f"Analysis Set {_set_number}")
    process_documents(
        f"text {_set_number}",
        f"candidate {_set_number}",
        f"Set {_set_number}",
    )