# policy-docs-qa/utils/llm_utils.py
import json
import os
import logging
from huggingface_hub import HfApi, InferenceClient
import utils.interface_utils as interface_utils
# Renamed constant to indicate it's a default/fallback
DEFAULT_LLM_ENDPOINT_URL = (
"https://r5lahjemc2zuajga.us-east-1.aws.endpoints.huggingface.cloud"
)
# Added Endpoint name constant
LLM_ENDPOINT_NAME = os.getenv(
"HF_LLM_ENDPOINT_NAME", "phi-4-max"
) # Get from env or default
RETRIEVAL_SYSTEM_PROMPT = """**Instructions:**
You are a helpful assistant presented with a document excerpt and a question.
Your job is to retrieve the most relevant passages from the provided document excerpt that help answer the question.
For each passage retrieved from the documents, provide:
- a brief summary of the context leading up to the passage (2 sentences max)
- the supported passage quoted exactly
- a brief summary of how the points in the passage are relevant to the question (2 sentences max)
The supporting passages should be a JSON-formatted list of dictionaries with the keys 'context', 'quote', and 'relevance'.
Provide up to 4 different supporting passages covering as many different aspects of the topic in question as possible.
Only include passages that are relevant to the question. If the document contains fewer relevant passages, or none at all, return a shorter or empty list.
"""
QA_RETRIEVAL_PROMPT = """Find passages from the following documents that help answer the question.
**Document Content:**
```markdown
{document}
```
**Question:**
{question}
JSON Output:"""
ANSWER_SYSTEM_PROMPT = """**Instructions:**
You are a helpful assistant presented with a list of snippets extracted from documents and a question.
The snippets are presented in a JSON-formatted list that includes a unique id (`id`), context, relevance, and the exact quote.
Your job is to answer the question based *only* on the most relevant provided snippet quotes, citing the snippets used for each sentence.
**Output Format:**
Your response *must* be a JSON-formatted list of dictionaries. Each dictionary represents a sentence in your answer and must have the following keys:
- `sentence`: A string containing the sentence.
- `citations`: A list of integers, where each integer is the `id` of a snippet that supports the sentence.
**Example Output:**
```json
[
    {
        "sentence": "This is the first sentence of the answer.",
        "citations": [1, 3]
    },
    {
        "sentence": "This is the second sentence, supported by another snippet.",
        "citations": [5]
    }
]
```
**Constraints:**
- Base your answer *only* on the information within the provided snippets.
- Do *not* use external knowledge.
- The sentences should flow together coherently.
- A single sentence can cite multiple snippets.
- The final answer should be no more than 5-6 sentences long.
- Ensure the output is valid JSON.
"""
ANSWER_PROMPT = """
Given the following snippets, answer the question.
```json
{snippets}
```
**Question:**
{question}
JSON Output:"""
# Initialize client using token from environment variables
client = InferenceClient(token=os.getenv("HF_TOKEN"))
# --- Endpoint Status Check Function ---
def check_endpoint_status(token: str | None, endpoint_name: str = LLM_ENDPOINT_NAME):
"""Checks the Inference Endpoint status and returns status dict."""
# (Function body moved from app.py - Ensure logging is configured)
logging.info(f"Checking endpoint status for '{endpoint_name}'...")
if not token:
logging.warning("HF Token not available, cannot check endpoint status.")
return {
"status": "ready",
"warning": "HF Token not available for status check.",
}
try:
api = HfApi(token=token)
endpoint = api.get_inference_endpoint(name=endpoint_name, token=token)
status = endpoint.status
logging.info(f"Endpoint '{endpoint_name}' status: {status}")
if status == "running":
return {"status": "ready"}
else:
if status == "scaledToZero":
logging.info(
f"Endpoint '{endpoint_name}' is scaled to zero. Attempting to resume..."
)
try:
endpoint.resume()
user_message = f"The required LLM endpoint ('{endpoint_name}') was scaled to zero and is **now restarting**. Please wait a few minutes and try submitting your query again."
logging.info(f"Resume command sent for '{endpoint_name}'.")
return {"status": "error", "ui_message": user_message}
except Exception as resume_error:
logging.error(
f"Failed to resume endpoint '{endpoint_name}': {resume_error}"
)
user_message = f"The required LLM endpoint ('{endpoint_name}') is scaled to zero. An attempt to automatically resume it failed: {resume_error}. Please check the endpoint status on Hugging Face."
return {"status": "error", "ui_message": user_message}
else:
user_message = f"The required LLM endpoint ('{endpoint_name}') is currently **{status}**. Analysis cannot proceed until it is running. Please check the endpoint status on Hugging Face."
logging.warning(
f"Endpoint '{endpoint_name}' is not ready (Status: {status})."
)
return {"status": "error", "ui_message": user_message}
except Exception as e:
error_msg = f"Error checking endpoint status for {endpoint_name}: {e}"
logging.error(error_msg)
return {
"status": "error",
"ui_message": f"Failed to check endpoint status. Please verify the endpoint name ('{endpoint_name}') and your token. Error: {e}",
}
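
# A minimal usage sketch (assumes HF_TOKEN is set in the environment). Callers are
# expected to gate LLM calls on the returned status and surface `ui_message` to the
# user otherwise. `_example_endpoint_gate` is an illustrative helper, not app code.
def _example_endpoint_gate() -> bool:
    status = check_endpoint_status(token=os.getenv("HF_TOKEN"))
    if status["status"] != "ready":
        logging.warning(status.get("ui_message", "Endpoint not ready."))
        return False
    return True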
def retrieve_passages(
    query, doc_embeds, passages, processed_docs, embed_model, max_docs=3
):
    """Retrieves relevant passages based on embedding similarity, limited by max_docs."""
    queries = [query]
    query_embeddings = embed_model.encode(queries, prompt_name="query")
    scores = embed_model.similarity(query_embeddings, doc_embeds)
    sorted_scores = scores.sort(descending=True)
    sorted_vals = sorted_scores.values[0].tolist()
    sorted_idx = sorted_scores.indices[0].tolist()
    results = [
        {
            "passage_id": i,
            "document_id": passages[i][0],
            "chunk_id": passages[i][1],
            "document_url": processed_docs[passages[i][0]]["url"],
            "passage_text": passages[i][2],
            "relevance": v,
        }
        for i, v in zip(sorted_idx, sorted_vals)
    ]
    # Slice the results here
    return results[:max_docs]
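
# A minimal usage sketch. It assumes `embed_model` behaves like a sentence-transformers
# model exposing `encode(...)` and `similarity(...)`, that `passages` is a list of
# (document_id, chunk_id, passage_text) tuples, and that `processed_docs` maps
# document_id -> {"url": ...}. All names below are illustrative, not part of this module.
def _example_retrieval(embed_model, passages, processed_docs):
    passage_texts = [p[2] for p in passages]
    doc_embeds = embed_model.encode(passage_texts)
    top = retrieve_passages(
        "How is personal data shared with third parties?",
        doc_embeds,
        passages,
        processed_docs,
        embed_model,
        max_docs=3,
    )
    # Each result dict carries passage_id, document_url, passage_text, relevance, ...
    return top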
# --- Excerpt Processing Function ---
def process_single_excerpt(
    excerpt_index: int, excerpt: dict, query: str, hf_client: InferenceClient
):
    """Processes a single retrieved excerpt using an LLM to find citations and spans."""
    passage_text = excerpt.get("passage_text", "")
    if not passage_text:
        return {
            "citations": [],
            "all_spans": [],
            "parse_successful": False,
            "raw_error_response": "Empty passage text",
        }
    citations = []
    all_spans = []
    is_parse_successful = False
    raw_error_response = None
    try:
        retrieval_prompt = QA_RETRIEVAL_PROMPT.format(
            document=passage_text, question=query
        )
        response = hf_client.chat_completion(
            messages=[
                {"role": "system", "content": RETRIEVAL_SYSTEM_PROMPT},
                {"role": "user", "content": retrieval_prompt},
            ],
            model=os.getenv("HF_LLM_ENDPOINT_URL", DEFAULT_LLM_ENDPOINT_URL),
            max_tokens=2048,
            temperature=0.01,
        )
        # Attempt to parse JSON
        response_content = response.choices[0].message.content.strip()
        try:
            # Find JSON block
            json_match = response_content.split("```json", 1)
            if len(json_match) > 1:
                json_str = json_match[1].split("```", 1)[0]
                parsed_json = json.loads(json_str)
                citations = parsed_json
                is_parse_successful = True
                # Find spans for each citation
                for cit in citations:
                    quote = cit.get("quote", "")
                    if quote:
                        # Call find_citation_spans from interface_utils
                        spans = interface_utils.find_citation_spans(
                            document=passage_text, citation=quote
                        )
                        cit["char_spans"] = spans  # Store spans in the citation dict
                        all_spans.extend(spans)
            else:
                raise ValueError("No ```json block found in response")
        except (json.JSONDecodeError, ValueError, IndexError) as json_e:
            print(f"Error parsing JSON for excerpt {excerpt_index}: {json_e}")
            is_parse_successful = False
            raw_error_response = f"LLM Response (failed to parse): {response_content}"  # Fixed potential newline issue
    except Exception as llm_e:
        print(f"Error during LLM call for excerpt {excerpt_index}: {llm_e}")
        is_parse_successful = False
        raw_error_response = f"LLM API Error: {llm_e}"
    return {
        "citations": citations,
        "all_spans": all_spans,
        "parse_successful": is_parse_successful,
        "raw_error_response": raw_error_response,
    }
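
# A minimal sketch of how retrieved excerpts might be turned into the numbered snippet
# list that ANSWER_SYSTEM_PROMPT expects (each entry gets an `id`). Illustrative only;
# the real assembly happens excerpt by excerpt in app.py.
def _example_build_snippets(excerpts: list, query: str) -> list:
    snippets = []
    for idx, excerpt in enumerate(excerpts):
        result = process_single_excerpt(idx, excerpt, query, client)
        if not result["parse_successful"]:
            continue
        for citation in result["citations"]:
            snippets.append(
                {
                    "id": len(snippets) + 1,
                    "context": citation.get("context", ""),
                    "relevance": citation.get("relevance", ""),
                    "quote": citation.get("quote", ""),
                }
            )
    return snippets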
def generate_summary_answer(snippets: list, query: str, hf_client: InferenceClient):
"""Generates a summarized answer based on provided snippets using an LLM."""
# NOTE: Removed llm_endpoint_url parameter, using env var directly
endpoint_url = os.getenv("HF_LLM_ENDPOINT_URL", DEFAULT_LLM_ENDPOINT_URL)
if not snippets:
return {
"answer_sentences": [],
"parse_successful": False,
"raw_error_response": "No snippets provided for summarization.",
}
try:
# Ensure snippets are formatted as a JSON string for the prompt
snippets_json_string = json.dumps(snippets, indent=2)
answer_prompt_formatted = ANSWER_PROMPT.format(
snippets=snippets_json_string, question=query
)
response = hf_client.chat_completion(
messages=[
{"role": "system", "content": ANSWER_SYSTEM_PROMPT},
{"role": "user", "content": answer_prompt_formatted},
],
model=endpoint_url,
max_tokens=512,
temperature=0.01,
)
# Attempt to parse JSON response
response_content = response.choices[0].message.content.strip()
try:
# Find JSON block (assuming it might be wrapped in ```json ... ```)
json_match = response_content.split("```json", 1)
if len(json_match) > 1:
json_str = json_match[1].split("```", 1)[0]
else: # Assume the response *is* the JSON if no backticks found
json_str = response_content
parsed_json = json.loads(json_str)
# Basic validation: check if it's a list of dictionaries with expected keys
if isinstance(parsed_json, list) and all(
isinstance(item, dict) and "sentence" in item and "citations" in item
for item in parsed_json
):
return {
"answer_sentences": parsed_json,
"parse_successful": True,
"raw_error_response": None,
}
else:
raise ValueError(
"Parsed JSON does not match expected format (list of {'sentence':..., 'citations':...})"
)
except (json.JSONDecodeError, ValueError, IndexError) as json_e:
print(f"Error parsing summary JSON: {json_e}")
return {
"answer_sentences": [],
"parse_successful": False,
"raw_error_response": f"LLM Response (failed to parse summary): {response_content}",
}
except Exception as llm_e:
print(f"Error during LLM summary call: {llm_e}")
return {
"answer_sentences": [],
"parse_successful": False,
"raw_error_response": f"LLM API Error during summary generation: {llm_e}",
}
# NOTE: make_supporting_snippets(...) was removed from this module; excerpts are now
# handled one at a time in app.py via process_single_excerpt.
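
# End-to-end sketch (illustrative, not used by the app): generate a cited answer from
# previously collected snippets. Assumes a running endpoint and a valid HF_TOKEN.
def _example_answer(snippets: list, query: str) -> list:
    result = generate_summary_answer(snippets, query, client)
    if not result["parse_successful"]:
        logging.warning(result["raw_error_response"])
        return []
    # Each entry: {"sentence": ..., "citations": [snippet ids]}
    return result["answer_sentences"]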