# File-viewer header (not code): albertmartinez, commit e9536a9 "Added score block number", 1.56 kB
import time
import pandas as pd
import polars as pl
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining
def mining(path, score: float) -> pl.DataFrame:
    """Mine similar sentence pairs from a single-column, tab-separated file.

    The file is loaded into one ``text`` column, every sentence is embedded
    with the OpenVINO build of ``sentence-transformers/all-MiniLM-L6-v2`` and
    ``paraphrase_mining`` scores the sentence pairs. Pair indices are then
    replaced by the sentences they point to.

    Args:
        path: Input location, handed unchanged to ``pandas.read_csv``. The
            first line is consumed as the header row and malformed lines are
            skipped.
        score: Exclusive lower bound. Only pairs whose score, after rounding
            to three decimals, is strictly greater than this value are kept.

    Returns:
        A polars DataFrame with the columns ``score`` (Float32, rounded to
        three decimals), ``sentence_1`` and ``sentence_2`` (the paired
        texts), sorted by descending score. The frame is empty when fewer
        than two sentences were loaded or when no pair passes the threshold.
    """
    # Monotonic clock: the reported duration cannot be skewed by system
    # clock adjustments made while the model is running.
    started = time.perf_counter()

    data = Dataset.from_pandas(
        pd.read_csv(path, on_bad_lines="skip", header=0, names=["text"], sep="\t")
    )
    n_sentences = len(data)

    if n_sentences < 2:
        # A pair needs two sentences. Skipping the model here also avoids
        # passing a chunk size of 0 to paraphrase_mining.
        paraphrases = []
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = SentenceTransformer(
            "sentence-transformers/all-MiniLM-L6-v2",
            backend="openvino",
            model_kwargs={"file_name": "openvino/openvino_model.xml"},
            device=device,
            trust_remote_code=True,
        )
        paraphrases = paraphrase_mining(
            model,
            data["text"],
            # Whole corpus in one chunk, with an upper bound of n^2 pairs.
            corpus_chunk_size=n_sentences,
            show_progress_bar=True,
            batch_size=1024,
            max_pairs=n_sentences ** 2,
        )

    # Each mined row is [score, index_1, index_2]. An explicit schema keeps
    # the column names and dtypes fixed even when no pair was produced,
    # instead of relying on auto-generated "0"/"1"/"2" column labels.
    pairs = pl.DataFrame(
        paraphrases,
        schema={
            "score": pl.Float64,
            "sentence_1": pl.Int64,
            "sentence_2": pl.Int64,
        },
        orient="row",
    )

    # Lookup table used to turn sentence indices back into their text.
    texts = pl.DataFrame(data.to_pandas()).get_column("text")
    first_idx = pairs.get_column("sentence_1").cast(pl.Int32)
    second_idx = pairs.get_column("sentence_2").cast(pl.Int32)

    # The threshold is applied to the rounded Float32 score, not the raw one.
    df = (
        pairs.with_columns(
            [
                pl.col("score").round(3).cast(pl.Float32),
                texts[first_idx].alias("sentence_1"),
                texts[second_idx].alias("sentence_2"),
            ]
        )
        .filter(pl.col("score") > score)
        .sort(["score"], descending=True)
    )

    elapsed_time = time.perf_counter() - started
    print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
    return df