# Indexing
Using [Haystack](https://github.com/deepset-ai/haystack), the following steps are performed:
- load and preprocess documents downloaded from Wikipedia
- create document store and write documents
- initialize retriever and generate document embeddings

In [None]:
! pip install farm-haystack[faiss-gpu]==1.7.0

## Load documents

In [2]:
import glob, json

In [3]:
# Load every crawled JSON file; each file contributes one entry to `docs`.
docs = []

# sorted(): glob returns paths in arbitrary, filesystem-dependent order. A stable order
# keeps the downstream document order reproducible across runs.
for json_file in sorted(glob.glob("../input/crawl-rock/rock_wiki/*.json")):
    # JSON is UTF-8 by specification; be explicit rather than rely on the platform default.
    with open(json_file, "r", encoding="utf-8") as fin:
        doc = json.load(fin)

    docs.append(doc)

In [4]:
# sanity check: number of JSON documents loaded
len(docs)

453

## Preprocess documents

In [6]:
# Clean the loaded documents and split them into chunks of 2 sentences each.

from haystack.nodes import PreProcessor

preprocessing_options = dict(
    language="en",
    # cleaning flags
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
    # chunking flags: 2 sentences per chunk, no overlap between chunks
    split_by="sentence",
    split_length=2,
    split_overlap=0,
    split_respect_sentence_boundary=False,
)
processor = PreProcessor(**preprocessing_options)
preprocessed_docs = processor.process(docs)

Preprocessing:   0%|          | 0/453 [00:00<?, ?docs/s]

In [7]:
# number of chunks produced by the preprocessing step
len(preprocessed_docs)

50024

In [8]:
# peek at the first 10 preprocessed chunks
preprocessed_docs[:10]

[<Document: {'content': 'Disturbed is an American heavy metal band from Chicago, formed in 1994. The band includes vocalist David Draiman, guitarist/keyboardist Dan Donegan, bassist John Moyer, and drummer Mike Wengren.', 'content_type': 'text', 'score': None, 'meta': {'name': 'Disturbed (band)', 'url': 'https://en.wikipedia.org/wiki/Disturbed_(band)', '_split_id': 0}, 'embedding': None, 'id': '543d4f9f9023bfc277edf307a6aef870'}>,
 <Document: {'content': 'Donegan and Wengren have been involved in the band since its inception, with Moyer replacing former bassist Steve "Fuzz" Kmak and Draiman replacing original lead vocalist Erich Awalt. The band has released seven studio albums, five of which have consecutively debuted at number one on the Billboard 200.', 'content_type': 'text', 'score': None, 'meta': {'name': 'Disturbed (band)', 'url': 'https://en.wikipedia.org/wiki/Disturbed_(band)', '_split_id': 1}, 'embedding': None, 'id': 'dfb0ef877837c95b2e8b03cfe2ae2057'}>,
 <Document: {'content

In [None]:
# Keep only chunks with at least 10 whitespace-separated words;
# shorter ones are not very informative.
preprocessed_docs = list(
    filter(lambda chunk: len(chunk.content.split()) >= 10, preprocessed_docs)
)

## Create document store ([FAISS](https://github.com/facebookresearch/faiss)) and write documents

In [9]:
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever

In [10]:
# Document store settings chosen to be compatible with the Embedding Retriever.
document_store = FAISSDocumentStore(
    embedding_dim=768,
    similarity="dot_product",
)

In [46]:
# write the preprocessed chunks into the FAISS document store
document_store.write_documents(preprocessed_docs)

Writing Documents:   0%|          | 0/50024 [00:00<?, ?it/s]

## Initialize retriever (Embedding Retriever) and generate document embeddings
We choose a Sentence Transformers model that is suitable for asymmetric semantic search (short queries and longer passages), according to the [documentation](https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search).

In [None]:
from haystack.nodes import EmbeddingRetriever

# Sentence Transformers checkpoint used by the retriever to compute embeddings.
embedding_model_name = "sentence-transformers/msmarco-distilbert-base-tas-b"

retriever = EmbeddingRetriever(
    embedding_model=embedding_model_name,
    model_format="sentence_transformers",
    # NOTE(review): presumably makes the "name" meta field part of the embedded text;
    # confirm against the Haystack docs.
    embed_meta_fields=["name"],
    document_store=document_store,
)

# Compute and store embeddings for the documents in the store, using the retriever.
document_store.update_embeddings(retriever)

## Save and export index

In [None]:
import shutil
import glob

In [73]:
import os

# Destination directory for the exported index files (replace the placeholder).
OUT_DIR = "YOUR-OUT-DIR"

# Persist the index to the current working directory.
document_store.save("my_faiss_index.faiss")

# shutil.copy treats a non-existent destination as a *file* name: without this,
# every copy would overwrite the previous one in a single file called OUT_DIR.
os.makedirs(OUT_DIR, exist_ok=True)

# "*faiss*.*" already matches everything "faiss*.*" does, so globbing once
# avoids copying the same files twice.
for f in glob.glob("*faiss*.*"):
    shutil.copy(f, OUT_DIR)