Ralqasimi committed on
Commit
9185bbd
·
verified ·
1 Parent(s): f6c61f9

Update knowledge_base.py

Browse files
Files changed (1) hide show
  1. knowledge_base.py +27 -1
knowledge_base.py CHANGED
@@ -37,4 +37,30 @@ def search_faiss(faiss_index, stored_texts, query, top_k=3):
37
  # Retrieve the corresponding texts
38
  results = [stored_texts[i] for i in indices[0] if i < len(stored_texts)]
39
 
40
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  # Retrieve the corresponding texts
38
  results = [stored_texts[i] for i in indices[0] if i < len(stored_texts)]
39
 
40
+ return results
41
+
42
+ import re
43
+
44
def clean_text(text):
    """Return *text* with unsupported symbols removed and whitespace normalised.

    Characters kept: Arabic letters (U+0621-U+064A), Arabic-Indic digits
    (U+0660-U+0669), ASCII letters and digits, the punctuation
    ``. , ! ? : - ( )`` and the Arabic comma, semicolon and question mark.
    Everything else is dropped; Arabic diacritics (U+064B-U+0652) are not in
    the kept set and are therefore removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with every run of whitespace (including newlines
        and tabs) collapsed to a single space and no leading or trailing
        whitespace.
    """
    # Strip disallowed symbols first. Whitespace is preserved at this stage so
    # that word boundaries survive until the collapse below.
    text = re.sub(
        r"[^\s\u0621-\u064A\u0660-\u0669a-zA-Z0-9.,!?\u060C\u061B\u061F:\-()]+",
        "",
        text,
    )
    # Collapse whitespace last: removing a symbol that sat between two spaces
    # would otherwise leave a double space in the result.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
51
+
52
def create_faiss_index(texts):
    """Build a flat L2 FAISS index over sentence embeddings of *texts*.

    Every text is passed through ``clean_text`` and then embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model. The embeddings are
    added to a ``faiss.IndexFlatL2`` in input order.

    Args:
        texts: Iterable of strings to index.

    Returns:
        A ``(index, texts)`` tuple: the populated FAISS index and the list of
        cleaned texts. Vector ``i`` in the index was computed from
        ``texts[i]``.

    Raises:
        ValueError: If *texts* contains no items.
    """
    # Clean the text before indexing (this also materialises any iterable).
    texts = [clean_text(t) for t in texts]

    # Fail early with a clear message: an empty batch yields no 2-D embedding
    # matrix to read the vector dimension from.
    if not texts:
        raise ValueError("create_faiss_index() requires at least one text")

    # Third-party imports are function-local, as in the original code.
    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # FAISS expects a C-contiguous float32 matrix; coerce explicitly rather
    # than relying on the encoder's default output dtype and layout.
    embeddings = np.ascontiguousarray(model.encode(texts), dtype=np.float32)

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, texts