Spaces:
Sleeping
Sleeping
File size: 2,136 Bytes
8677815 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
from haystack.utils import convert_files_to_docs
from haystack.nodes import PreProcessor
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset, load_from_disk
import pandas as pd
from haystack.nodes import BM25Retriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever
import warnings
warnings.filterwarnings('ignore')
def generate_docs(overlap, length, *, dir_path='data', output_path='outputs/docs-dataset'):
    '''
    Split the files under ``dir_path`` into chunks and save them to disk.

    The files are converted to documents, run through a ``PreProcessor``
    that splits by word, loaded into a pandas DataFrame, wrapped in a
    ``Dataset`` and written with ``save_to_disk``.

    Args:
        overlap: Forwarded to the preprocessor as ``split_overlap``.
        length: Forwarded to the preprocessor as ``split_length``.
        dir_path: Directory handed to ``convert_files_to_docs``.
            Defaults to ``'data'``.
        output_path: Destination handed to ``Dataset.save_to_disk``.
            Defaults to ``'outputs/docs-dataset'``.

    Returns:
        None. The result of the call is the dataset written to
        ``output_path``.
    '''
    all_docs = convert_files_to_docs(dir_path=dir_path)

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_overlap=overlap,
        split_length=length,
        split_respect_sentence_boundary=False,
    )
    docs = preprocessor.process(all_docs)
    # print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")

    # Go through a DataFrame so the processed documents can be turned into
    # an Arrow table, which is what the Dataset constructor receives.
    df = pd.DataFrame(docs)
    dataset = Dataset(pa.Table.from_pandas(df))
    dataset.save_to_disk(output_path)
def retriever1(*, dataset_path='outputs/docs-dataset', top_k=5):
    '''
    Build a BM25 retriever over the dataset stored at ``dataset_path``.

    The dataset is loaded with ``load_from_disk``, written into a new
    ``InMemoryDocumentStore`` created with ``use_bm25=True``, and a
    ``BM25Retriever`` bound to that store is returned. No query is run
    here; the caller uses the returned retriever.

    Args:
        dataset_path: Location handed to ``load_from_disk``.
            Defaults to ``'outputs/docs-dataset'``.
        top_k: Forwarded to ``BM25Retriever`` as ``top_k``. Defaults to 5.

    Returns:
        The ``BM25Retriever`` instance backed by the populated store.
    '''
    dataset = load_from_disk(dataset_path)

    # BM25Retriever with InMemoryDocumentStore
    document_store = InMemoryDocumentStore(use_bm25=True)
    # The loaded dataset is passed to the store as-is.
    document_store.write_documents(dataset)

    return BM25Retriever(document_store=document_store, top_k=top_k)
# def retriever2():
# document_store = FAISSDocumentStore(similarity="dot_product")
# retriever = DensePassageRetriever(
# document_store=document_store,
# query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
# passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
# )
# document_store.update_embeddings(retriever)
# return retriever
# generate_docs(20, 250)
# ret = retriever2()
|