Update app.py
Browse files
app.py
CHANGED
@@ -23,14 +23,14 @@ login(token=hf_token)
|
|
23 |
|
24 |
def chunk_text(text, chunk_size=250, chunk_overlap=50):
|
25 |
splitter = RecursiveCharacterTextSplitter(
|
26 |
-
chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=[" ", "\n", "."]
|
27 |
)
|
28 |
chunks = splitter.split_text(text)
|
29 |
return chunks
|
30 |
|
31 |
# Function to build FAISS index
|
32 |
-
|
33 |
-
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")
|
34 |
|
35 |
def build_faiss_vectorstore(chunks):
|
36 |
vectorstore = FAISS.from_texts(chunks, embedding_model)
|
@@ -65,12 +65,12 @@ class ChatRequest(BaseModel):
|
|
65 |
model_choice: str = "google"
|
66 |
|
67 |
# grab dataset
|
68 |
-
dataset = load_dataset("Lhumpal/youtube-hunting-beast-transcripts", data_files={"concise": "concise/*", "raw": "raw/*"})
|
69 |
-
|
70 |
-
|
71 |
|
72 |
# Chunk and index the documents
|
73 |
-
chunks = chunk_text(
|
74 |
# Build the vectorstore
|
75 |
vectorstore = build_faiss_vectorstore(chunks)
|
76 |
|
@@ -149,7 +149,7 @@ async def chat(request: ChatRequest):
|
|
149 |
del request.chat_history[-1]
|
150 |
request.chat_history.append({"role": "user", "parts": [{"text": request.message}]})
|
151 |
|
152 |
-
return {"response": response.text, "dataset_str":
|
153 |
|
154 |
if request.model_choice == "HF":
|
155 |
if hf_token:
|
|
|
23 |
|
24 |
def chunk_text(text, chunk_size=250, chunk_overlap=50):
|
25 |
splitter = RecursiveCharacterTextSplitter(
|
26 |
+
chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n,\n", " ", "\n", "."]
|
27 |
)
|
28 |
chunks = splitter.split_text(text)
|
29 |
return chunks
|
30 |
|
31 |
# Function to build FAISS index
|
32 |
+
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
33 |
+
# embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")
|
34 |
|
35 |
def build_faiss_vectorstore(chunks):
|
36 |
vectorstore = FAISS.from_texts(chunks, embedding_model)
|
|
|
65 |
model_choice: str = "google"
|
66 |
|
67 |
# grab dataset
|
68 |
+
dataset = load_dataset("Lhumpal/youtube-hunting-beast-transcripts", data_files={"concise": "concise/*", "raw": "raw/*", "facts": "facts/*"})
|
69 |
+
text = dataset["facts"]["text"]
|
70 |
+
text_string = "".join(text)
|
71 |
|
72 |
# Chunk and index the documents
|
73 |
+
chunks = chunk_text(text_string, chunk_size=400)
|
74 |
# Build the vectorstore
|
75 |
vectorstore = build_faiss_vectorstore(chunks)
|
76 |
|
|
|
149 |
del request.chat_history[-1]
|
150 |
request.chat_history.append({"role": "user", "parts": [{"text": request.message}]})
|
151 |
|
152 |
+
return {"response": response.text, "dataset_str": text_string, "docs": docs, "filtered_docs_and_scores": filtered_docs_and_scores, "history": request.chat_history, "RAG_prompt": rag_prompt, "chunks": chunks}
|
153 |
|
154 |
if request.model_choice == "HF":
|
155 |
if hf_token:
|