Tollef Jørgensen
committed on
Commit
·
acc49a5
1
Parent(s):
d45f5b7
v2
Browse files- .gitignore +1 -0
- app.py +36 -16
- faiss.lookup.csv +0 -0
- faiss.index.zip → files.zip +2 -2
- prep.py +5 -2
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
*.index
|
|
|
|
1 |
*.index
|
2 |
+
*.csv
|
app.py
CHANGED
@@ -1,16 +1,22 @@
|
|
|
|
1 |
import faiss
|
2 |
import gradio as gr
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
import zipfile
|
|
|
|
|
|
|
|
|
7 |
|
8 |
-
index_file = "faiss.index.zip"
|
9 |
|
10 |
-
|
|
|
11 |
z.extractall()
|
12 |
|
13 |
pr_number = 14
|
|
|
14 |
model = SentenceTransformer(
|
15 |
"intfloat/multilingual-e5-small",
|
16 |
revision=f"refs/pr/{pr_number}",
|
@@ -36,7 +42,7 @@ class FaissIndex:
|
|
36 |
def extract_docs(self, indices, k):
|
37 |
indices = list(indices[0])
|
38 |
lookup = self.df.iloc[indices]
|
39 |
-
questions = lookup["
|
40 |
answers = lookup["answer"].values
|
41 |
|
42 |
pairs = list(zip(questions, answers))
|
@@ -51,7 +57,7 @@ class FaissIndex:
|
|
51 |
# format pairs as: f"{answer}\n{kilde: {question}}"
|
52 |
formatted_pairs = []
|
53 |
for pair in filtered_pairs:
|
54 |
-
formatted_pairs.append(f"{pair[1]}
|
55 |
return formatted_pairs
|
56 |
|
57 |
def search(self, query: str, k: int = 5):
|
@@ -62,10 +68,13 @@ class FaissIndex:
|
|
62 |
return self.extract_docs(indices, k)
|
63 |
|
64 |
|
|
|
65 |
index = FaissIndex(model)
|
66 |
|
67 |
|
68 |
def query_faiss_index(søketekst):
|
|
|
|
|
69 |
"""
|
70 |
Queries the FAISS index with the provided search text and returns the top 5 results.
|
71 |
Args:
|
@@ -74,20 +83,31 @@ def query_faiss_index(søketekst):
|
|
74 |
str: A string containing the top 5 search results, separated by double newlines.
|
75 |
"""
|
76 |
|
77 |
-
results = index.search(søketekst, k=
|
78 |
return "\n\n".join(results)
|
79 |
|
80 |
|
81 |
# Create the Gradio interface
|
82 |
-
iface = gr.Interface(
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
)
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
# Launch the Gradio app
|
92 |
-
if __name__ == "__main__":
|
93 |
-
|
|
|
1 |
+
|
2 |
import faiss
|
3 |
import gradio as gr
|
4 |
import numpy as np
|
5 |
import pandas as pd
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
import zipfile
|
8 |
+
import os
|
9 |
+
import logging
|
10 |
+
|
11 |
+
logging.basicConfig(level=logging.ERROR)
|
12 |
|
|
|
13 |
|
14 |
+
# if not os.path.exists("faiss.index"):
|
15 |
+
with zipfile.ZipFile("files.zip", "r") as z:
|
16 |
z.extractall()
|
17 |
|
18 |
pr_number = 14
|
19 |
+
logging.info("Loading embedding model")
|
20 |
model = SentenceTransformer(
|
21 |
"intfloat/multilingual-e5-small",
|
22 |
revision=f"refs/pr/{pr_number}",
|
|
|
42 |
def extract_docs(self, indices, k):
|
43 |
indices = list(indices[0])
|
44 |
lookup = self.df.iloc[indices]
|
45 |
+
questions = lookup["query"].values
|
46 |
answers = lookup["answer"].values
|
47 |
|
48 |
pairs = list(zip(questions, answers))
|
|
|
57 |
# format pairs as: f"{answer}" (the "kilde: {question}" source line is no longer appended)
|
58 |
formatted_pairs = []
|
59 |
for pair in filtered_pairs:
|
60 |
+
formatted_pairs.append(f"{pair[1]}")
|
61 |
return formatted_pairs
|
62 |
|
63 |
def search(self, query: str, k: int = 5):
|
|
|
68 |
return self.extract_docs(indices, k)
|
69 |
|
70 |
|
71 |
+
logging.info("Loading FAISS index")
|
72 |
index = FaissIndex(model)
|
73 |
|
74 |
|
75 |
def query_faiss_index(søketekst):
|
76 |
+
if len(søketekst) < 3:
|
77 |
+
return
|
78 |
"""
|
79 |
Queries the FAISS index with the provided search text and returns the top 3 results.
|
80 |
Args:
|
|
|
83 |
str: A string containing the top 3 search results, separated by double newlines.
|
84 |
"""
|
85 |
|
86 |
+
results = index.search(søketekst, k=3)
|
87 |
return "\n\n".join(results)
|
88 |
|
89 |
|
90 |
# Create the Gradio interface
|
91 |
+
# iface = gr.Interface(
|
92 |
+
# fn=query_faiss_index,
|
93 |
+
# inputs=gr.Textbox(lines=2, placeholder="Søk etter info i SIKT", interactive=True, min_width="30vw"),
|
94 |
+
# outputs=gr.Textbox(label="Søkeresultater", type="text", lines=20, min_width="70vw"),
|
95 |
+
# title="SIKT-FAQ",
|
96 |
+
# description="Semantisk søk i SIKT med Openvino.",
|
97 |
+
# live=True
|
98 |
+
# )
|
99 |
+
|
100 |
+
with gr.Blocks() as blocks:
|
101 |
+
gr.Markdown("## SIKT-FAQ")
|
102 |
+
with gr.Row():
|
103 |
+
box_search = gr.Textbox(label="Søk etter informasjon i SIKT", lines=1, placeholder="Innlogging i FEIDE...", interactive=True)
|
104 |
+
with gr.Row():
|
105 |
+
box_output = gr.Textbox(label="Søkeresultater", type="text", lines=20)
|
106 |
+
|
107 |
+
box_search.change(fn=query_faiss_index, inputs=box_search, outputs=box_output, max_batch_size=1)
|
108 |
+
|
109 |
+
|
110 |
+
blocks.launch()
|
111 |
# Launch the Gradio app
|
112 |
+
# if __name__ == "__main__":
|
113 |
+
# iface.launch()
|
faiss.lookup.csv
DELETED
The diff for this file is too large to render.
See raw diff
|
|
faiss.index.zip → files.zip
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8739c76e681f900923b900c9df0ef75cf421d39cabb54650c4b9ad19b6a76d85
|
3 |
+
size 22
|
prep.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
import zipfile
|
2 |
|
3 |
index_file = "faiss.index"
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
|
1 |
import zipfile
|
2 |
|
3 |
index_file = "faiss.index"
|
4 |
+
lookup_file = "faiss.lookup.csv"
|
5 |
+
|
6 |
+
with zipfile.ZipFile("files.zip", "w") as z:
|
7 |
+
z.write(index_file)
|
8 |
+
z.write(lookup_file)
|