# resume-screener / app.py
# PDFs
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings as HFE
from langchain_core.documents import Document
# Groq
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from groq import Groq
# Expanded Queries
import ast
# Cross Encoder
from sentence_transformers import CrossEncoder
# BM25
from rank_bm25 import BM25Okapi
import numpy as np
# Gradio
import gradio as gr
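# The embedding model name and cross-encoder are created once at module load so
# every Gradio request reuses them instead of re-instantiating per call.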
embed_model = "sentence-transformers/all-MiniLM-L6-v2"
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
prompt = ChatPromptTemplate.from_messages(
[
("system", """
You are a helpful HR assistant specializing in the resume screening phase.
Your goal is to identify the best, most suitable, or highest-potential
candidates whose qualifications align well with the provided job title
and job description. If a question or request falls outside the scope
of resume screening and candidate alignment,
please respond with 'I don't know'.
"""),
MessagesPlaceholder(variable_name="history", optional=True),
("system", "Context: {context}"),
("human", "{question}"),
]
)
query_expansion_prompt = ChatPromptTemplate.from_messages([
("system", """
You are an expert HR assistant. Given a job description and a user query,
generate 3 alternative, diverse search queries that capture different
aspects of what makes a great candidate for this role. Each query should
focus on a different facet (e.g., skills, leadership, hands-on experience,
certifications, unique achievements).
If the job description is empty, generate a general job description for the role
mentioned in the user query and then create the 3 alternative search queries based on that.
Return ONLY the generated queries as a Python list of strings. Do not include
any other explanatory text or formatting.
"""),
("human", "Job Description: {job_description}\nUser Query: {user_query}")
])
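# Illustrative (hypothetical) model output for the prompt above, which is
# parsed downstream with ast.literal_eval:
# '["hands-on Python and MLOps project experience",
#   "led cross-functional data science teams",
#   "cloud certifications and notable ML achievements"]'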
JUDGE_PROMPT = """
You are an expert recruiter. Given the job description, the user query, and the system's answer, rate:
Faithfulness: Does the answer accurately reflect the resume(s) provided? (1-5)
Relevance: Does the answer address the job requirements and user query? (1-5)
Provide your feedback as follows:
Faithfulness: <score>
Relevance: <score>
Justification: <brief explanation>
Job Description:
{job_description}
User Query:
{user_query}
System Answer:
{system_answer}
"""
def load_single_pdf(path):
    """Load a PDF and return its full text as a single Document."""
    loader = PyPDFLoader(path)
    pages = loader.load()
    full_text = "\n".join(page.page_content for page in pages)
    return Document(page_content=full_text, metadata={"source": path})
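# Example usage (illustrative; "resume.pdf" is a placeholder path):
#   doc = load_single_pdf("resume.pdf")
#   print(doc.metadata["source"], len(doc.page_content))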
def chunks_embed(chunks, model_name):
"""Create embeds for doc chunks and store in FAISS"""
embeds = HFE(model_name=model_name)
# Create FAISS index
db = FAISS.from_documents(chunks, embeds)
print(f"Created FAISS Index with {len(chunks)} documents.")
return db
def search_docs_mmr(db, query, k, fetch_k, lambda_mult):
"""
Retrieve the most similar docs to the query using MMR
(Maximum Marginal Relevance)
"""
if not db:
print("Error: No document database available")
return []
    docs = db.max_marginal_relevance_search(
        query, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult
    )
return docs
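# Note on the MMR parameters: fetch_k is how many candidates the similarity
# search considers before diversification, and lambda_mult trades relevance
# (1.0) against diversity (0.0); screen_resumes below uses 0.7, which leans
# toward relevance.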
def combine_results(results):
    # Combine the content from results into a single context string.
    # (Currently unused: screen_resumes joins the re-ranked docs inline.)
context = ""
for doc in results:
context += doc.page_content + "\n"
return context
# 1. Prepare corpus for BM25
def prepare_bm25_corpus(docs):
    # Tokenize for BM25 (simple lowercased whitespace split; a proper
    # tokenizer with stopword removal would likely improve recall)
return [doc.page_content.lower().split() for doc in docs]
# 2. Initialize BM25
def init_bm25(docs):
corpus = prepare_bm25_corpus(docs)
return BM25Okapi(corpus)
# 3. BM25 Search
def bm25_search(bm25, query, docs, top_k=10):
query_tokens = query.lower().split()
scores = bm25.get_scores(query_tokens)
top_indices = np.argsort(scores)[::-1][:top_k]
return [docs[i] for i in top_indices], [scores[i] for i in top_indices]
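# BM25 scores are unnormalized and query-dependent, so only their relative
# order within a single query is meaningful; screen_resumes discards the raw
# scores and hybrid_merge works purely on rank order.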
# Hybrid Merge Function
def hybrid_merge(semantic_results, bm25_results):
# Merge by union, keeping order (semantic first, then BM25 if not already present)
seen = set()
merged = []
for doc in semantic_results + bm25_results:
if doc.page_content not in seen:
merged.append(doc)
seen.add(doc.page_content)
return merged
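# Alternative merge strategy (optional sketch, not wired in below): Reciprocal
# Rank Fusion scores documents by their rank in each result list instead of
# taking a simple ordered union. k=60 is the conventional RRF smoothing
# constant.
def hybrid_merge_rrf(semantic_results, bm25_results, k=60):
    scores, docs_by_key = {}, {}
    for results in (semantic_results, bm25_results):
        for rank, doc in enumerate(results):
            key = doc.page_content
            docs_by_key[key] = doc
            # 0-based rank, so the RRF term is 1 / (k + rank + 1)
            scores[key] = scores.get(key, 0.0) + 1.0 / (k + rank + 1)
    return [docs_by_key[key] for key in sorted(scores, key=scores.get, reverse=True)]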
def llm_judge_groq(api_key, job_description, user_query, system_answer):
judge_prompt = JUDGE_PROMPT.format(
job_description=job_description,
user_query=user_query,
system_answer=system_answer
)
client = Groq(api_key=api_key)
completion = client.chat.completions.create(
model="deepseek-r1-distill-llama-70b",
messages=[{"role": "user", "content": judge_prompt}],
max_tokens=512
)
return completion.choices[0].message.content
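# Optional helper (illustrative sketch): pull the numeric scores out of the
# judge's free-text feedback. Assumes the judge follows the format requested
# in JUDGE_PROMPT; fields it cannot find come back as None.
import re

def parse_judge_scores(feedback):
    scores = {}
    for field in ("Faithfulness", "Relevance"):
        match = re.search(rf"{field}:\s*([1-5])", feedback)
        scores[field.lower()] = int(match.group(1)) if match else None
    return scores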
def screen_resumes(api_key, job_description, user_query, files):
    # Reuse the module-level embed_model and cross_encoder so they are not
    # reloaded on every request; only the chat model below needs the
    # user-supplied API key.
model = ChatGroq(model="llama-3.1-8b-instant", api_key=api_key)
history = {}
def get_session_history(session_id: str):
if session_id not in history:
history[session_id] = ChatMessageHistory()
return history[session_id]
chain = prompt | model
with_message_history = RunnableWithMessageHistory(
chain,
get_session_history,
input_messages_key="question",
history_messages_key="history"
)
    # Load and process resumes (one Document per PDF; no further chunking)
    resume_paths = [file.name for file in files]
    chunks = [load_single_pdf(path) for path in resume_paths]
embeds = chunks_embed(chunks, embed_model)
bm25 = init_bm25(chunks)
# Query Expansion
prompt_value = query_expansion_prompt.invoke({
"job_description": job_description,
"user_query": user_query,
})
expanded_queries_response = model.invoke(prompt_value.messages)
    try:
        expanded_queries = ast.literal_eval(expanded_queries_response.content)
    except (ValueError, SyntaxError):
        # Fall back to the raw user query if the model strays from the
        # requested Python-list format.
        expanded_queries = [user_query]
# Hybrid Retrieval
all_semantic = []
all_bm25 = []
for q in expanded_queries:
semantic_docs = search_docs_mmr(embeds, q, 10, 100, 0.7)
bm25_docs, _ = bm25_search(bm25, q, chunks, top_k=10)
all_semantic.extend(semantic_docs)
all_bm25.extend(bm25_docs)
    merged_results = hybrid_merge(all_semantic, all_bm25)
    # Cross-encoder Re-ranking
    pairs = [(user_query, doc.page_content) for doc in merged_results]
    scores = cross_encoder.predict(pairs)
    ranked = sorted(zip(scores, merged_results), key=lambda x: x[0], reverse=True)
top_n = min(5, len(ranked))
ranked_top_n = [doc for score, doc in ranked[:top_n]]
context = "\n\n".join([doc.page_content for doc in ranked_top_n])
# LLM Final Reasoning
inputs = {
"context": context,
"question": user_query,
}
config = {"configurable": {"session_id": "GradioSession"}}
response = with_message_history.invoke(inputs, config=config)
system_output = response.content
# LLM-as-a-Judge Evaluation
judge_feedback = llm_judge_groq(api_key, job_description, user_query, system_output)
return system_output, context, judge_feedback
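# Illustrative programmatic call (hypothetical values; the Gradio UI below is
# the normal entry point). Any object with a .name attribute pointing at a
# PDF path works for `files`:
#   answer, context, feedback = screen_resumes(
#       api_key="gsk_...",
#       job_description="Senior ML engineer ...",
#       user_query="Which candidates best fit this role?",
#       files=[open("resume.pdf", "rb")],
#   )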
demo = gr.Interface(
fn=screen_resumes,
inputs=[
gr.Textbox(label="Groq API Key", type="password", lines=1, placeholder="sk..."),
gr.Textbox(lines=4, label="Job Description"),
gr.Textbox(lines=2, label="User Query"),
gr.File(file_count="multiple", label="Upload Resume PDFs")
],
outputs=[
gr.Textbox(label="Screening Result (LLM Output)"),
gr.Textbox(label="Top Ranked Resumes (Raw Text)"),
gr.Textbox(label="LLM-as-a-Judge Evaluation (DeepSeek)")
],
title="Resume Screening Assistant (Hybrid + LLM-as-a-Judge)",
description="Enter your Groq API key, upload resumes, enter a job description and query, get the best candidates with explanations, and see an automated evaluation."
)
demo.launch(share=True)