File size: 2,136 Bytes
8677815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from haystack.utils import convert_files_to_docs
from haystack.nodes import PreProcessor

import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset, load_from_disk
import pandas as pd

from haystack.nodes import BM25Retriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever


import warnings
warnings.filterwarnings('ignore')

def generate_docs(overlap, length, dir_path='data', output_path='outputs/docs-dataset'):
    '''
    Convert the files under ``dir_path`` to documents, split them into
    word-based passages and save the result to disk as a Hugging Face dataset.

    Parameters
    ----------
    overlap:
        Forwarded to ``PreProcessor`` as ``split_overlap``.
    length:
        Forwarded to ``PreProcessor`` as ``split_length``.
    dir_path:
        Directory handed to ``convert_files_to_docs``. Defaults to ``'data'``.
    output_path:
        Directory the dataset is written to with ``Dataset.save_to_disk``.
        Defaults to ``'outputs/docs-dataset'``.

    Returns
    -------
    None
        The only result is the dataset written at ``output_path``.
    '''
    raw_docs = convert_files_to_docs(dir_path=dir_path)

    # Split on words and do not respect sentence boundaries, so passage size
    # is governed by ``length`` and ``overlap`` alone.
    splitter = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_overlap=overlap,
        split_length=length,
        split_respect_sentence_boundary=False,
    )
    passages = splitter.process(raw_docs)

    # Documents -> pandas DataFrame -> Arrow table -> datasets.Dataset; the
    # DataFrame is only an intermediate step, the on-disk artifact is the
    # Dataset.
    frame = pd.DataFrame(passages)
    dataset = Dataset(pa.Table.from_pandas(frame))
    dataset.save_to_disk(output_path)


def retriever1(dataset_path='outputs/docs-dataset', top_k=5):
    '''
    Build a BM25 retriever over a dataset previously saved to disk.

    Parameters
    ----------
    dataset_path:
        Directory passed to ``load_from_disk``. Defaults to
        ``'outputs/docs-dataset'``.
    top_k:
        Forwarded to ``BM25Retriever`` as ``top_k``. Defaults to 5.

    Returns
    -------
    BM25Retriever
        Retriever bound to a fresh ``InMemoryDocumentStore`` (created with
        ``use_bm25=True``) that holds the loaded documents.
    '''
    stored_docs = load_from_disk(dataset_path)

    # BM25Retriever with InMemoryDocumentStore
    store = InMemoryDocumentStore(use_bm25=True)
    store.write_documents(stored_docs)

    return BM25Retriever(document_store=store, top_k=top_k)


# def retriever2():
#     document_store = FAISSDocumentStore(similarity="dot_product")
#     retriever = DensePassageRetriever(
#         document_store=document_store,
#         query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#         passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
#     )
#     document_store.update_embeddings(retriever)

#     return retriever
# generate_docs(20, 250)
# ret = retriever2()