File size: 1,520 Bytes
68051dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from typing import Any, List

from pydantic import StrictStr
from qdrant_client import QdrantClient
from qdrant_client.fastembed_common import QueryResponse


client = QdrantClient(host="localhost", port=6333)


def get_chunks(url: StrictStr) -> List[Element]:
    elements: List[Element] = partition(url=url)
    for i in range(len(elements)):
        elements[i].text = clean_non_ascii_chars(text=elements[i].text)
        elements[i].text = replace_unicode_quotes(text=elements[i].text)
        elements[i].text = clean_extra_whitespace(text=elements[i].text)
        elements[i].text = bytes_string_to_string(text=elements[i].text)
    return chunk_by_title(elements=elements)


def add_data(chunks: List[Element]) -> None:
    docs: List[StrictStr] = [chunks[i].text for i in range(len(chunks))]
    metadata: List[dict[str, Any]] = [
        chunks[i].metadata.to_dict() for i in range(len(chunks))
    ]
    ids = list(range(1, len(chunks) + 1))
    client.add(
        collection_name=settings.vector_database_name,
        documents=docs,
        metadata=metadata,
        ids=ids,
    )


def query_db(query: StrictStr) -> List[QueryResponse]:
    return client.query(
        collection_name=settings.vector_database_name,
        query_text=query,
    )


if __name__ == "__main__":
    url = "https://en.wikipedia.org/wiki/Napoleon"
    chunks: List[Element] = get_chunks(url=url)
    add_data(chunks=chunks)
    r: List[QueryResponse] = query_db(query="Napoleon Bonaparte")
    print(len(r))
    print(r)