from pickle import dump
from typing import List, Optional

from cohere import Client
from numpy import array

from gossip_semantic_search.models import Article, ProcessedDataset
from gossip_semantic_search.utils import embed_content, CustomUnpickler


class DatasetProcessor:
    def __init__(self,
                 dataset_path: str,
                 saved_processed_dataset_path: str):
        self.dataset_path = dataset_path
        self.saved_processed_dataset_path = saved_processed_dataset_path
        self.processed_dataset: Optional[ProcessedDataset] = None

    @staticmethod
    def load_dataset(dataset_path: str) -> List[Article]:
        # Deserialize the pickled list of articles with the project's
        # custom unpickler.
        with open(dataset_path, 'rb') as file:
            unpickler = CustomUnpickler(file)
            data = unpickler.load()
        return data

    def process_dataset(self, data: List[Article]):
        client = Client()

        # Flatten the per-article questions, keeping the index of the
        # source article as the ground-truth label for each question.
        y_true = []
        questions = []
        for i, sample in enumerate(data):
            for question in sample.questions:
                y_true.append(i)
                questions.append(question)

        # Embed the flattened questions; article contents are already
        # embedded on each sample.
        self.processed_dataset = ProcessedDataset(
            y_true=array(y_true),
            embedded_queries=embed_content(questions, client),
            embedded_context=array([sample.embeded_content for sample in data]))

    def save_articles(self):
        # Persist the processed dataset to disk as a pickle.
        with open(self.saved_processed_dataset_path, 'wb') as f:
            dump(self.processed_dataset, f)

    def run(self):
        data = self.load_dataset(self.dataset_path)
        self.process_dataset(data)
        self.save_articles()
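

# Example usage (a minimal sketch): the paths below are hypothetical
# placeholders, not files shipped with this repository, and Client()
# assumes a Cohere API key is available in the environment.
if __name__ == "__main__":
    processor = DatasetProcessor(
        dataset_path="data/articles.pkl",
        saved_processed_dataset_path="data/processed_dataset.pkl",
    )
    processor.run()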