# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import Any, Dict, List, Optional
import numpy as np
from camel.loaders import UnstructuredIO
from camel.retrievers import BaseRetriever
from camel.utils import dependencies_required
DEFAULT_TOP_K_RESULTS = 1
class BM25Retriever(BaseRetriever):
r"""An implementation of the `BaseRetriever` using the `BM25` model.
    This class facilitates the retrieval of relevant information using a
    query-based approach; it ranks documents based on the occurrence and
    frequency of the query terms.
Attributes:
bm25 (BM25Okapi): An instance of the BM25Okapi class used for
calculating document scores.
content_input_path (str): The path to the content that has been
processed and stored.
unstructured_modules (UnstructuredIO): A module for parsing files and
URLs and chunking content based on specified parameters.
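    Note:
        As a rough sketch of the ranking: Okapi BM25 scores a document ``D``
        for a query ``Q`` as the sum over query terms ``q`` of
        ``IDF(q) * f(q, D) * (k1 + 1) / (f(q, D) + k1 * (1 - b + b * |D| / avgdl))``,
        where ``f(q, D)`` is the frequency of ``q`` in ``D``, ``|D|`` is the
        document length, ``avgdl`` is the average document length in the
        corpus, and ``k1`` and ``b`` are free parameters of the model.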
References:
https://github.com/dorianbrown/rank_bm25
"""
@dependencies_required('rank_bm25')
def __init__(self) -> None:
r"""Initializes the BM25Retriever."""
from rank_bm25 import BM25Okapi
        self.bm25: Optional[BM25Okapi] = None
        self.content_input_path: str = ""
        self.chunks: List[Any] = []
        self.unstructured_modules: UnstructuredIO = UnstructuredIO()
def process(
self,
content_input_path: str,
chunk_type: str = "chunk_by_title",
**kwargs: Any,
) -> None:
r"""Processes content from a file or URL, divides it into chunks by
using `Unstructured IO`,then stored internally. This method must be
called before executing queries with the retriever.
Args:
content_input_path (str): File path or URL of the content to be
processed.
            chunk_type (str): Type of chunking to apply to the parsed
                elements. Defaults to "chunk_by_title".
**kwargs (Any): Additional keyword arguments for content parsing.
"""
from rank_bm25 import BM25Okapi
# Load and preprocess documents
self.content_input_path = content_input_path
elements = self.unstructured_modules.parse_file_or_url(
content_input_path, **kwargs
)
if elements:
self.chunks = self.unstructured_modules.chunk_elements(
chunk_type=chunk_type, elements=elements
)
# Convert chunks to a list of strings for tokenization
tokenized_corpus = [str(chunk).split(" ") for chunk in self.chunks]
self.bm25 = BM25Okapi(tokenized_corpus)
        else:
            self.chunks = []
            self.bm25 = None
def query(
self,
query: str,
top_k: int = DEFAULT_TOP_K_RESULTS,
) -> List[Dict[str, Any]]:
r"""Executes a query and compiles the results.
Args:
            query (str): Query string for information retrieval.
            top_k (int, optional): The number of top results to return during
                retrieval. Must be a positive integer. Defaults to
                `DEFAULT_TOP_K_RESULTS`.
Returns:
            List[Dict[str, Any]]: Concatenated list of the query results.
Raises:
            ValueError: If `top_k` is less than or equal to 0, or if the BM25
                model has not been initialized by calling `process` first.
"""
if top_k <= 0:
raise ValueError("top_k must be a positive integer.")
if self.bm25 is None or not self.chunks:
raise ValueError(
"BM25 model is not initialized. Call `process` first."
)
# Preprocess query similarly to how documents were processed
processed_query = query.split(" ")
# Retrieve documents based on BM25 scores
scores = self.bm25.get_scores(processed_query)
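        # np.argpartition does a partial sort: it arranges the indices so the
        # last top_k entries hold the top_k highest scores (in no particular
        # order), which is cheaper than fully sorting all scores.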
top_k_indices = np.argpartition(scores, -top_k)[-top_k:]
formatted_results = []
for i in top_k_indices:
result_dict = {
'similarity score': scores[i],
'content path': self.content_input_path,
'metadata': self.chunks[i].metadata.to_dict(),
'text': str(self.chunks[i]),
}
formatted_results.append(result_dict)
# Sort the list of dictionaries by 'similarity score' from high to low
formatted_results.sort(
key=lambda x: x['similarity score'], reverse=True
)
return formatted_results
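

# A minimal usage sketch (illustrative only): the file path and query below
# are placeholders, and `rank_bm25` plus the unstructured dependencies must
# be installed for this to run.
if __name__ == "__main__":
    retriever = BM25Retriever()
    # Parse and chunk the content, then build the BM25 index over the chunks.
    retriever.process(content_input_path="local_data/example_document.pdf")
    # Rank the chunks against the query and print the single best match.
    for result in retriever.query(query="What does this document cover?", top_k=1):
        print(result['similarity score'], result['text'])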