Lhumpal committed on
Commit 7cc632c · verified · 1 Parent(s): c6ee45e

Update app.py

Files changed (1): app.py +8 -8
app.py CHANGED
@@ -23,14 +23,14 @@ login(token=hf_token)
 
 def chunk_text(text, chunk_size=250, chunk_overlap=50):
     splitter = RecursiveCharacterTextSplitter(
-        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=[" ", "\n", "."]
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n,\n", " ", "\n", "."]
     )
     chunks = splitter.split_text(text)
     return chunks
 
 # Function to build FAISS index
-# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")
+embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")
 
 def build_faiss_vectorstore(chunks):
     vectorstore = FAISS.from_texts(chunks, embedding_model)
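The first hunk changes two things: the splitter now tries a "\n,\n" record delimiter before falling back to spaces, newlines, and periods, and the embedding model switches from BAAI/bge-large-en to the lighter sentence-transformers/all-MiniLM-L6-v2. A minimal sketch (not from the repo; the sample text and sizes are illustrative) of how the separator priority plays out:

# RecursiveCharacterTextSplitter tries each separator in order, falling back
# to the next only when a piece still exceeds chunk_size, so "\n,\n" record
# boundaries win over word breaks. On newer LangChain releases this import
# lives in langchain_text_splitters instead.
from langchain.text_splitter import RecursiveCharacterTextSplitter

sample = "deer bed on leeward ridges\n,\nthermals rise after sunrise\n,\nwind direction drives access"
splitter = RecursiveCharacterTextSplitter(
    chunk_size=40, chunk_overlap=0, separators=["\n,\n", " ", "\n", "."]
)
for chunk in splitter.split_text(sample):
    print(repr(chunk))  # each record stays intact instead of splitting mid-fact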
@@ -65,12 +65,12 @@ class ChatRequest(BaseModel):
     model_choice: str = "google"
 
 # grab dataset
-dataset = load_dataset("Lhumpal/youtube-hunting-beast-transcripts", data_files={"concise": "concise/*", "raw": "raw/*"})
-concise_text = dataset["concise"]["text"]
-concise_text_string = "".join(concise_text)
+dataset = load_dataset("Lhumpal/youtube-hunting-beast-transcripts", data_files={"concise": "concise/*", "raw": "raw/*", "facts": "facts/*"})
+text = dataset["facts"]["text"]
+text_string = "".join(text)
 
 # Chunk and index the documents
-chunks = chunk_text(concise_text_string, chunk_size=400)
+chunks = chunk_text(text_string, chunk_size=400)
 # Build the vectorstore
 vectorstore = build_faiss_vectorstore(chunks)
 
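The second hunk re-points ingestion at a new "facts" split of the dataset and renames concise_text/concise_text_string to text/text_string. A hedged end-to-end sketch of the same build-and-query path, using toy chunks and an invented query in place of the dataset above (classic LangChain import paths assumed; newer releases move these to langchain_community):

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
chunks = ["Mature bucks bed on leeward ridge points.", "Morning thermals pull scent uphill."]

vectorstore = FAISS.from_texts(chunks, embedding_model)  # same call build_faiss_vectorstore makes
for doc, score in vectorstore.similarity_search_with_score("where do bucks bed?", k=2):
    print(round(score, 3), doc.page_content)  # FAISS scores are L2 distances: lower is closer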
 
@@ -149,7 +149,7 @@ async def chat(request: ChatRequest):
         del request.chat_history[-1]
         request.chat_history.append({"role": "user", "parts": [{"text": request.message}]})
 
-        return {"response": response.text, "dataset_str": concise_text_string, "docs": docs, "filtered_docs_and_scores": filtered_docs_and_scores, "history": request.chat_history, "RAG_prompt": rag_prompt, "chunks": chunks}
+        return {"response": response.text, "dataset_str": text_string, "docs": docs, "filtered_docs_and_scores": filtered_docs_and_scores, "history": request.chat_history, "RAG_prompt": rag_prompt, "chunks": chunks}
 
     if request.model_choice == "HF":
         if hf_token:
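The final hunk only swaps the source of dataset_str to text_string, but the filtered_docs_and_scores key in the payload implies a relevance cutoff applied upstream of this return, which the diff does not show. A hypothetical helper sketching that step (the name and the 1.0 threshold are invented for illustration):

def filter_by_distance(docs_and_scores, max_distance=1.0):
    # docs_and_scores: (Document, float) pairs as returned by
    # similarity_search_with_score; keep matches under the L2 cutoff.
    return [(doc, score) for doc, score in docs_and_scores if score < max_distance]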
 