Spaces:
Sleeping
Sleeping
File size: 1,520 Bytes
68051dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
from typing import Any, List
from pydantic import StrictStr
from qdrant_client import QdrantClient
from qdrant_client.fastembed_common import QueryResponse
client = QdrantClient(host="localhost", port=6333)
def get_chunks(url: StrictStr) -> List[Element]:
elements: List[Element] = partition(url=url)
for i in range(len(elements)):
elements[i].text = clean_non_ascii_chars(text=elements[i].text)
elements[i].text = replace_unicode_quotes(text=elements[i].text)
elements[i].text = clean_extra_whitespace(text=elements[i].text)
elements[i].text = bytes_string_to_string(text=elements[i].text)
return chunk_by_title(elements=elements)
def add_data(chunks: List[Element]) -> None:
docs: List[StrictStr] = [chunks[i].text for i in range(len(chunks))]
metadata: List[dict[str, Any]] = [
chunks[i].metadata.to_dict() for i in range(len(chunks))
]
ids = list(range(1, len(chunks) + 1))
client.add(
collection_name=settings.vector_database_name,
documents=docs,
metadata=metadata,
ids=ids,
)
def query_db(query: StrictStr) -> List[QueryResponse]:
return client.query(
collection_name=settings.vector_database_name,
query_text=query,
)
if __name__ == "__main__":
url = "https://en.wikipedia.org/wiki/Napoleon"
chunks: List[Element] = get_chunks(url=url)
add_data(chunks=chunks)
r: List[QueryResponse] = query_db(query="Napoleon Bonaparte")
print(len(r))
print(r)
|