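"""
Parallel loading and price-filtering of the McAuley-Lab Amazon-Reviews-2023
metadata dataset into Item objects (the Item class is defined in items.py)
"""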
from datetime import datetime
from tqdm import tqdm
from datasets import load_dataset
from concurrent.futures import ProcessPoolExecutor
from items import Item
CHUNK_SIZE = 1000   # datapoints per chunk handed to each worker
MIN_PRICE = 0.5     # lower price bound for including an item (see from_datapoint)
MAX_PRICE = 999.49  # upper price bound for including an item (see from_datapoint)
class ItemLoader:
    """Loads one category of the Amazon Reviews 2023 metadata and filters it into Items"""

    def __init__(self, name):
        self.name = name
        self.dataset = None
    def from_datapoint(self, datapoint):
        """
        Try to create an Item from this datapoint
        Return the Item if successful, or None if it shouldn't be included
        """
        try:
            price_str = datapoint['price']
            if price_str:
                price = float(price_str)
                if MIN_PRICE <= price <= MAX_PRICE:
                    item = Item(datapoint, price)
                    return item if item.include else None
        except ValueError:
            return None
        return None  # no price given, or price out of range
    def from_chunk(self, chunk):
        """
        Create a list of Items from this chunk of elements from the Dataset
        """
        batch = []
        for datapoint in chunk:
            result = self.from_datapoint(datapoint)
            if result:
                batch.append(result)
        return batch
    def chunk_generator(self):
        """
        Iterate over the Dataset, yielding successive chunks of up to CHUNK_SIZE datapoints
        """
        size = len(self.dataset)
        for i in range(0, size, CHUNK_SIZE):
            yield self.dataset.select(range(i, min(i + CHUNK_SIZE, size)))
    def load_in_parallel(self, workers):
        """
        Use concurrent.futures to farm out the work of processing chunks of datapoints -
        this speeds up processing significantly, but will tie up your computer while it's doing so!
        """
        results = []
        chunk_count = (len(self.dataset) + CHUNK_SIZE - 1) // CHUNK_SIZE  # ceiling division
        with ProcessPoolExecutor(max_workers=workers) as pool:
            for batch in tqdm(pool.map(self.from_chunk, self.chunk_generator()), total=chunk_count):
                results.extend(batch)
        for result in results:
            result.category = self.name
        return results
    def load(self, workers=8):
        """
        Load in this dataset; the workers parameter specifies how many processes
        should work on loading and scrubbing the data
        """
        start = datetime.now()
        print(f"Loading dataset {self.name}", flush=True)
        self.dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_meta_{self.name}", split="full", trust_remote_code=True)
        results = self.load_in_parallel(workers)
        finish = datetime.now()
        print(f"Completed {self.name} with {len(results):,} datapoints in {(finish-start).total_seconds()/60:.1f} mins", flush=True)
        return results
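
# A minimal usage sketch, assuming "Appliances" is one of the raw_meta_* category
# configs of McAuley-Lab/Amazon-Reviews-2023 (any other category name works the
# same way). The __main__ guard matters because ProcessPoolExecutor spawns worker
# processes that re-import this module on Windows and macOS.
if __name__ == "__main__":
    loader = ItemLoader("Appliances")
    items = loader.load(workers=8)
    print(f"Loaded {len(items):,} items in category {loader.name}")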