Spaces:

Lhumpal
/

beast-llm

Sleeping

App Files Files Community

beast-llm / app.py

Lhumpal

Update app.py

c6ee45e verified about 1 month ago

raw

history blame

8.82 kB

	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel
	from huggingface_hub import InferenceClient
	import os
	import textwrap
	from google import genai
	from google.genai.types import GenerateContentConfig
	from datasets import load_dataset
	from huggingface_hub import login
	from typing import List, Dict, Any
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS
	import numpy as np

	app = FastAPI()

	# Credentials are read from the environment; either value is None when the
	# corresponding variable is not set.
	hf_token = os.environ.get("HF_TOKEN")
	google_api_key = os.environ.get("GOOGLE_API_KEY")

	# Authenticate with the Hugging Face Hub only when a token is configured.
	# Calling login() without a token falls back to an interactive prompt, which
	# cannot be answered in a server process.
	if hf_token:
	    login(token=hf_token)

	def chunk_text(text, chunk_size=250, chunk_overlap=50):
	    """Split ``text`` into overlapping chunks.

	    Args:
	        text: The string to split.
	        chunk_size: Value forwarded to the splitter's ``chunk_size``.
	        chunk_overlap: Value forwarded to the splitter's ``chunk_overlap``.

	    Returns:
	        The list of chunks produced by
	        ``RecursiveCharacterTextSplitter.split_text``.
	    """
	    # The splitter is configured with the separators " ", "\n" and ".",
	    # in that order.
	    return RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        separators=[" ", "\n", "."],
	    ).split_text(text)

	# Embedding model used when building the FAISS index: it is passed to
	# FAISS.from_texts in build_faiss_vectorstore.
	# Alternative model tried earlier: "sentence-transformers/all-MiniLM-L6-v2".
	embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")

	def build_faiss_vectorstore(chunks):
	    """Build a FAISS vector store from text chunks.

	    The chunks are embedded with the module-level ``embedding_model``. The
	    number of indexed documents is printed before the store is returned.

	    Args:
	        chunks: The texts to index.

	    Returns:
	        The store created by ``FAISS.from_texts``.
	    """
	    store = FAISS.from_texts(chunks, embedding_model)
	    print(f"Total number of documents: {len(store.index_to_docstore_id)}")
	    return store

	# Function to retrieve similar text
	def retrieve(query, vectorstore, top_k=8, score_threshold=0.8):
	    """Return the stored texts most similar to ``query``.

	    Args:
	        query: The search text.
	        vectorstore: Object exposing
	            ``similarity_search_with_score(query=..., k=...)`` that returns
	            ``(doc, score)`` pairs, where ``doc`` has a ``page_content``
	            attribute.
	        top_k: Maximum number of hits requested from the store.
	        score_threshold: Hits whose score is greater than this value are
	            dropped. Defaults to 0.8, the cutoff that was previously
	            hard-coded.
	            NOTE(review): keeping low scores suggests the score is a
	            distance (lower = closer) -- confirm against the store's metric.

	    Returns:
	        A ``(docs_content, filtered_docs_and_scores)`` tuple:
	        ``docs_content`` is the list of retained texts and
	        ``filtered_docs_and_scores`` is the list of ``(text, score)`` pairs
	        with each score converted to a built-in ``float``.
	    """
	    hits = vectorstore.similarity_search_with_score(query=query, k=top_k)

	    # Convert each score exactly once, then filter on the converted value.
	    scored = ((doc.page_content, float(score)) for doc, score in hits)
	    filtered_docs_and_scores = [
	        (text, score) for text, score in scored if score <= score_threshold
	    ]

	    docs_content = [text for text, _ in filtered_docs_and_scores]
	    return docs_content, filtered_docs_and_scores

	# Default persona prompt used when a request does not supply its own
	# system_message.
	_DEFAULT_SYSTEM_MESSAGE = """You are Dan Infalt, a friendly public land deer hunting expert specializing in targeting mature bucks in pressured areas, but
	don’t worry, you won’t take yourself too seriously. You respond in a conversational matter but still direct. You have dry humor you mix in every once in a while.
	You focus on buck bedding, terrain reading, and aggressive yet calculated mobile tactics. Your blue-collar, no-nonsense approach
	emphasizes deep scouting, strategic access, and minimalist setups. Through The Hunting Beast, you teach hunters how to kill big bucks
	using terrain, wind, and thermals. You speak from firsthand experience, keeping your advice practical and to the point. Provide detailed
	yet concise responses that fully articulate your experience and answer the user query.
	"""


	class ChatRequest(BaseModel):
	    """Request body accepted by the ``/chat`` endpoint."""

	    # The user's latest message (the only required field).
	    message: str
	    # System instruction passed to the model.
	    system_message: str = _DEFAULT_SYSTEM_MESSAGE
	    # Sampling temperature forwarded to the model call.
	    temperature: float = 1.5
	    # Output-token cap forwarded to the Gemini call.
	    max_output_tokens: int = 200
	    # Prior conversation turns; the "google" branch of chat() passes them
	    # as the ``contents`` of the generate_content call.
	    chat_history: List[Dict[str, Any]] = []
	    # Backend selector; chat() checks for "google" and "HF".
	    model_choice: str = "google"

	# Load the transcript dataset with two splits ("concise" and "raw"); only the
	# "text" column of the "concise" split is used below.
	dataset = load_dataset(
	    "Lhumpal/youtube-hunting-beast-transcripts",
	    data_files={"concise": "concise/", "raw": "raw/"},
	)
	concise_text = dataset["concise"]["text"]
	# All entries are concatenated with no separator between them.
	concise_text_string = "".join(concise_text)

	# Chunk the concatenated text, then build the vector store from the chunks.
	chunks = chunk_text(concise_text_string, chunk_size=400)
	vectorstore = build_faiss_vectorstore(chunks)

	@app.post("/chat")
	async def chat(request: ChatRequest):
	    """Answer a chat message with the backend chosen by ``model_choice``.

	    ``"google"``: optionally summarises older history, retrieves related
	    transcript chunks from the module-level ``vectorstore``, wraps the user
	    message in a RAG prompt and calls ``gemini-2.0-flash``. The response
	    includes the answer plus debugging data (retrieved docs and scores,
	    history, the RAG prompt, and the full dataset string and chunk list).

	    ``"HF"``: sends only the system message and user message to
	    ``meta-llama/Llama-3.2-3B-Instruct`` via ``InferenceClient``; no
	    retrieval or history is used.

	    Args:
	        request: The parsed request body.

	    Returns:
	        A dict with a ``"response"`` key, plus the debugging keys described
	        above for the ``"google"`` backend.

	    Raises:
	        HTTPException: 400 when ``model_choice`` is not recognised; 500 with
	            the error text for any other failure.
	    """
	    try:
	        if request.model_choice == "google":
	            client = genai.Client(api_key=google_api_key)

	            # Summarize older chat history once it grows past the threshold.
	            # NOTE(review): the summary covers everything except the last
	            # `summary_thresh` items while `summary_thresh + 2` items are
	            # kept, and the summary is inserted at index 1 -- confirm this
	            # overlap and position are intended.
	            summary_thresh = 10
	            if len(request.chat_history) > summary_thresh:
	                summarize_prompt = f"""Please summarize the following chat history concisely, focusing on the key points and main topics discussed. Avoid
	unnecessary details and provide a clear, straightforward summary. {request.chat_history[:-summary_thresh]}"""
	                summary_response = client.models.generate_content(
	                    model="gemini-2.0-flash",
	                    contents=summarize_prompt,
	                    config=GenerateContentConfig(
	                        system_instruction=["You are a helpful assistant who is an expert at summarization."],
	                        max_output_tokens=250,
	                        temperature=0.5,
	                    ),
	                )
	                request.chat_history = request.chat_history[-(summary_thresh + 2):]
	                request.chat_history.insert(
	                    1,
	                    {
	                        "role": "user",
	                        "parts": [{"text": f"Here is a summary of this conversation so far: {summary_response.text}"}],
	                    },
	                )

	            # Retrieve relevant text for the user's message.
	            docs, filtered_docs_and_scores = retrieve(request.message, vectorstore, top_k=8)
	            docs = "\n\n".join(docs)

	            rag_prompt = f"""Use the following information to answer the user's query. You do not have to use all the information, just the pieces that directly
	help answer the query most accurately. Start directly with information, NOT with a question, and NOT restating the subject matter of the user query in
	any way, or you will be penalized. Respond in a conversational manner.

	Here are three examples of the style and tone of a response. Notice the good response and bad response. Please respond like the good response and NOT like the bad response:

	User Query: How do big bucks use clear cuts for bedding?

	Bad Response: Alright, so you want to know big bucks use clear cuts for bedding?, eh? Well, a lot of people assume big bucks bed right in the middle of a clear
	cut because it’s thick, but that’s not really the case. The dense regrowth provides food and cover, but bucks still want the upper hand.

	Good Response: Yeah, a lot of guys think big bucks just bed right in the middle of a clear cut because it’s thick, but that’s not really how they use it. The
	thick regrowth is great for food and cover, but those bucks still want an advantage. Most of the time, they’re bedding on the edges, right where the cut
	meets older timber. They’ll set up with the wind at their back so they can smell anything sneaking up behind them, and they’re looking out into the open
	woods, watching for danger.

	You have access to the following relevant information retrieved based on the user's query:

	{docs}

	Using the information above, answer the user's query as accurately as possible in the tone and style of the Good Response:

	User Query: {request.message}
	"""

	            # Remove the unformatted user message (the last history entry).
	            # Guarded so the default empty history does not raise IndexError.
	            if request.chat_history:
	                del request.chat_history[-1]
	            # Add the user message with RAG data in its place.
	            rag_prompt = textwrap.dedent(rag_prompt)
	            request.chat_history.append({"role": "user", "parts": [{"text": rag_prompt}]})

	            response = client.models.generate_content(
	                model="gemini-2.0-flash",
	                contents=request.chat_history,
	                config=GenerateContentConfig(
	                    system_instruction=[request.system_message],
	                    max_output_tokens=request.max_output_tokens,
	                    temperature=request.temperature,
	                ),
	            )

	            # Drop the RAG prompt and put back the unformatted user message so
	            # the returned history holds what the user actually typed.
	            del request.chat_history[-1]
	            request.chat_history.append({"role": "user", "parts": [{"text": request.message}]})

	            # NOTE(review): "dataset_str" and "chunks" return the whole corpus
	            # on every call; kept because callers may depend on these keys.
	            return {
	                "response": response.text,
	                "dataset_str": concise_text_string,
	                "docs": docs,
	                "filtered_docs_and_scores": filtered_docs_and_scores,
	                "history": request.chat_history,
	                "RAG_prompt": rag_prompt,
	                "chunks": chunks,
	            }

	        if request.model_choice == "HF":
	            if not hf_token:
	                raise ValueError("HF_TOKEN environment variable not set. Please add it as a secret in your Hugging Face Space.")
	            client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct", token=hf_token)

	            messages = [
	                {"role": "system", "content": request.system_message},
	                {"role": "user", "content": request.message},
	            ]

	            # ChatRequest declares `max_output_tokens`, not `max_tokens`, and
	            # has no `top_p` field; reading either missing attribute raised
	            # AttributeError on every HF request. `top_p` falls back to None
	            # unless the request model provides one.
	            response = client.chat_completion(
	                messages=messages,
	                max_tokens=request.max_output_tokens,
	                temperature=request.temperature,
	                top_p=getattr(request, "top_p", None),
	            )

	            return {"response": response.choices[0].message.content}

	        # Neither backend matched: report a client error rather than
	        # implicitly returning None.
	        raise HTTPException(
	            status_code=400,
	            detail=f"Unsupported model_choice: {request.model_choice!r}. Expected 'google' or 'HF'.",
	        )

	    except HTTPException:
	        # Let deliberate HTTP errors through with their own status code.
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e