Update app.py
Browse files
app.py
CHANGED
@@ -23,14 +23,14 @@ login(token=hf_token)
|
|
23 |
|
24 |
def chunk_text(text, chunk_size=250, chunk_overlap=50):
|
25 |
splitter = RecursiveCharacterTextSplitter(
|
26 |
-
chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=[" ", "\n", "."]
|
27 |
)
|
28 |
chunks = splitter.split_text(text)
|
29 |
return chunks
|
30 |
|
31 |
# Function to build FAISS index
|
32 |
-
|
33 |
-
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")
|
34 |
|
35 |
def build_faiss_vectorstore(chunks):
|
36 |
vectorstore = FAISS.from_texts(chunks, embedding_model)
|
@@ -65,12 +65,12 @@ class ChatRequest(BaseModel):
|
|
65 |
model_choice: str = "google"
|
66 |
|
67 |
# grab dataset
|
68 |
-
dataset = load_dataset("Lhumpal/youtube-hunting-beast-transcripts", data_files={"concise": "concise/*", "raw": "raw/*"})
|
69 |
-
|
70 |
-
|
71 |
|
72 |
# Chunk and index the documents
|
73 |
-
chunks = chunk_text(
|
74 |
# Build the vectorstore
|
75 |
vectorstore = build_faiss_vectorstore(chunks)
|
76 |
|
@@ -149,7 +149,7 @@ async def chat(request: ChatRequest):
|
|
149 |
del request.chat_history[-1]
|
150 |
request.chat_history.append({"role": "user", "parts": [{"text": request.message}]})
|
151 |
|
152 |
-
return {"response": response.text, "dataset_str":
|
153 |
|
154 |
if request.model_choice == "HF":
|
155 |
if hf_token:
|
|
|
23 |
|
24 |
def chunk_text(text, chunk_size=250, chunk_overlap=50):
|
25 |
splitter = RecursiveCharacterTextSplitter(
|
26 |
+
chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n,\n", " ", "\n", "."]
|
27 |
)
|
28 |
chunks = splitter.split_text(text)
|
29 |
return chunks
|
30 |
|
31 |
# Function to build FAISS index
|
32 |
+
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
33 |
+
# embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")
|
34 |
|
35 |
def build_faiss_vectorstore(chunks):
|
36 |
vectorstore = FAISS.from_texts(chunks, embedding_model)
|
|
|
65 |
model_choice: str = "google"
|
66 |
|
67 |
# grab dataset
|
68 |
+
dataset = load_dataset("Lhumpal/youtube-hunting-beast-transcripts", data_files={"concise": "concise/*", "raw": "raw/*", "facts": "facts/*"})
|
69 |
+
text = dataset["facts"]["text"]
|
70 |
+
text_string = "".join(text)
|
71 |
|
72 |
# Chunk and index the documents
|
73 |
+
chunks = chunk_text(text_string, chunk_size=400)
|
74 |
# Build the vectorstore
|
75 |
vectorstore = build_faiss_vectorstore(chunks)
|
76 |
|
|
|
149 |
del request.chat_history[-1]
|
150 |
request.chat_history.append({"role": "user", "parts": [{"text": request.message}]})
|
151 |
|
152 |
+
return {"response": response.text, "dataset_str": text_string, "docs": docs, "filtered_docs_and_scores": filtered_docs_and_scores, "history": request.chat_history, "RAG_prompt": rag_prompt, "chunks": chunks}
|
153 |
|
154 |
if request.model_choice == "HF":
|
155 |
if hf_token:
|