Tollef Jørgensen
ignore and some updates
77c842b
raw
history blame contribute delete
3.96 kB
import faiss
import gradio as gr
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
# Row of ``df`` for the currently selected case; rebound by on_selection_change.
idx = 0
# FAISS index over the selected case's sentence embeddings.
# Stays None until on_selection_change builds one.
index = None
# Per-sentence frame (url / sentences / embedding) of the selected case.
# Stays None until on_selection_change builds one.
newdoc = None
# Loaded by name at import time, before the UI is constructed.
dataset = load_dataset("tollefj/rettsavgjoerelser_100samples_embeddings")
# Sentence encoder used by faiss_search to embed the query text.
model = SentenceTransformer("NbAiLab/nb-sbert-base")
# The "train" split as a DataFrame; the rest of the file reads its
# url, sentences, embedding and summary columns.
df = dataset["train"].to_pandas()
def build_doc_frame(df, idx):
    """Return one row per sentence for the document at position ``idx``.

    Args:
        df: Frame with at least ``url``, ``sentences`` and ``embedding``
            columns, where the last two hold equally long list-likes.
        idx: Positional (``iloc``) row number of the document.

    Returns:
        A frame with columns ``url``, ``sentences`` and ``embedding`` in which
        every sentence and its embedding occupy their own row; the URL is
        repeated on each row and the original row label is kept as the index.
    """
    wanted = ["url", "sentences", "embedding"]
    # Selecting a single row yields a Series; turn it back into a 1-row frame.
    single_row = df.iloc[idx].to_frame().T
    # Explode both list columns together so sentence i stays next to
    # embedding i.
    return single_row[wanted].explode(["sentences", "embedding"])
def get_doc_embeddings(doc):
    """Stack the per-row embeddings of ``doc`` into a single float32 array.

    Args:
        doc: Object exposing an ``embedding`` Series-like attribute whose
            entries are equally long numeric vectors.

    Returns:
        A newly allocated ``float32`` NumPy array with one row per entry.
    """
    vectors = doc.embedding.tolist()
    return np.array(vectors, dtype=np.float32)
def faiss_search(query_str, K=5):
    """Find the sentences of the selected case that best match ``query_str``.

    The query is embedded with the module-level ``model``, L2-normalised and
    searched in the module-level FAISS ``index``; hits are mapped back to
    sentences through the module-level ``newdoc`` frame. Both ``index`` and
    ``newdoc`` are populated by ``on_selection_change``.

    Args:
        query_str: Free-text query.
        K: Number of hits requested. Coerced to ``int`` because UI sliders
            may deliver floats.

    Returns:
        A printable string: a ``Top K results for: ...`` header followed by
        one ``Score: <score>\\t\\t<sentence>`` line per hit, or a short notice
        when no case has been selected yet.
    """
    # Only read the module state: declaring it ``global`` here is unnecessary
    # and previously let the result loop overwrite the selected-case ``idx``.
    if index is None or newdoc is None:
        return "No case selected. Choose a court decision before searching."

    requested = int(K)
    header = f"Top {requested} results for: {query_str}"

    # Never ask for more neighbours than there are indexed sentences: FAISS
    # pads missing results with label -1, which ``iloc`` would silently map
    # to the last sentence.
    k = min(requested, index.ntotal)
    if k < 1:
        return f"{header}\n"

    # FAISS expects a contiguous (n, dim) float32 matrix; normalising in place
    # makes the inner product equal to cosine similarity.
    query_emb = np.ascontiguousarray(
        model.encode([query_str]), dtype="float32"
    ).reshape(1, -1)
    faiss.normalize_L2(query_emb)

    scores, hits = index.search(query_emb, k)
    print(list(zip(scores[0], hits[0])))

    result_lines = [
        f"Score: {round(float(score), 3)}\t\t{newdoc.iloc[int(hit)].sentences}"
        for score, hit in zip(scores[0], hits[0])
        if hit >= 0  # defensive: skip any padding label
    ]
    return header + "\n" + "\n".join(result_lines)
# def DropdownSummary():
# next_opts = df.iloc[idx].summary.tolist()
# return gr.Dropdown.update(choices=next_opts, label="Velg fra oppsummeringene")
# Dropdown choices: the URL of every case, in row order. Reading the column
# directly avoids building a Series per row with ``iterrows``.
dropdown_opts = df["url"].tolist()
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been stripped -- confirm the row/column layout.
with gr.Blocks() as demo:
    gr.HTML("\n<h1>Lovdata rettsavgjørelser - semantisk søk</h1>\n")

    def on_selection_change(selected_case):
        """Load the chosen case and rebuild the sentence index for it.

        Rebinds the module-level ``idx``, ``newdoc`` and ``index`` that
        ``faiss_search`` reads.

        Args:
            selected_case: URL picked in the case dropdown (a value of
                ``df.url``). A value matching no row, such as ``None``,
                resets the state.

        Returns:
            ``(summary_markup, url_markup)``: an HTML ``<ul>`` of the case's
            summary sentences and an HTML link to the case.
        """
        global idx, index, newdoc
        import html  # local import keeps this block self-contained

        # Resolve the row *position*: everything downstream uses ``iloc``.
        matches = np.flatnonzero((df.url == selected_case).to_numpy())
        if matches.size == 0:
            # Nothing (valid) selected: back to the initial, index-less state.
            idx, index, newdoc = 0, None, None
            return "", ""

        idx = int(matches[0])
        print("Selection changed!")
        print(selected_case)

        doc_frame = build_doc_frame(df, idx)
        embeddings = get_doc_embeddings(doc_frame)
        # Normalised vectors + inner-product index == cosine similarity.
        faiss.normalize_L2(embeddings)
        # Take the dimension from the data instead of hard-coding 768.
        new_index = faiss.IndexFlatIP(embeddings.shape[1])
        new_index.add(embeddings)
        # Publish frame and index together, only once both are complete.
        newdoc, index = doc_frame, new_index

        # Escape dataset text before embedding it in HTML.
        items = "".join(
            f"<li>{html.escape(str(sent))}</li>" for sent in df.iloc[idx].summary
        )
        summary_markup = f"<ul>{items}</ul>"
        safe_url = html.escape(str(selected_case), quote=True)
        url_markup = f"<a href='{safe_url}' target='_blank'>{safe_url}</a>"
        return summary_markup, url_markup

    with gr.Row():
        with gr.Column():
            case_dropdown = gr.Dropdown(
                label="Velg en rettsavgjørelse", choices=dropdown_opts
            )
            # NOTE(review): ``placeholder`` may not be a supported gr.HTML
            # argument in every Gradio version -- confirm.
            summary_html = gr.HTML(
                label="Predefinert oppsummering",
                placeholder="<p>Velg en sak først</p>",
            )
            case_url = gr.HTML(
                label="URL til rettsavgjørelse",
                placeholder="https://lovdata.no/...",
            )
        with gr.Column():
            query = gr.Textbox(
                label="Søk etter setninger",
                lines=1,
                placeholder="Kollisjon mellom to kjøretøy.",
            )
            k_slider = gr.Slider(
                minimum=1, maximum=10, label="Antall treff", value=5, step=1
            )
            search_btn = gr.Button("Søk")
            output = gr.Textbox(label="Resultater", lines=10)

    # Picking a case refreshes the summary/link and rebuilds the index.
    case_dropdown.change(
        on_selection_change,
        inputs=[case_dropdown],
        outputs=[summary_html, case_url],
    )
    # Searching runs against whatever case is currently indexed.
    search_btn.click(faiss_search, inputs=[query, k_slider], outputs=[output])

demo.launch()