|
""" |
|
A buffered iterator for big arrays. |
|
|
|
This module solves the problem of iterating over a big file-based array |
|
without having to read it into memory. The `Arrayterator` class wraps |
|
an array object, and when iterated it will return sub-arrays with at most |
|
a user-specified number of elements. |
|
|
|
""" |
|
from __future__ import division, absolute_import, print_function

import numbers
from operator import mul
from functools import reduce

from numpy.compat import long
|
|
|
__all__ = ['Arrayterator'] |
|
|
|
|
|
class Arrayterator(object):
    """
    Buffered iterator for big arrays.

    `Arrayterator` creates a buffered iterator for reading big arrays in small
    contiguous blocks. The class is useful for objects stored in the
    file system. It allows iteration over the object *without* reading
    everything in memory; instead, small blocks are read and iterated over.

    `Arrayterator` can be used with any object that supports multidimensional
    slices. This includes NumPy arrays, but also variables from
    Scientific.IO.NetCDF or pynetcdf for example.

    Parameters
    ----------
    var : array_like
        The object to iterate over. It must expose a ``shape`` attribute and
        accept a tuple of slices as an index.
    buf_size : int, optional
        The buffer size. If `buf_size` is supplied, the maximum amount of
        data that will be read into memory is `buf_size` elements.
        Default is None, which will read as many elements as possible
        into memory.

    Attributes
    ----------
    var
    buf_size
    start
    stop
    step
    shape
    flat

    See Also
    --------
    ndenumerate : Multidimensional array iterator.
    flatiter : Flat array iterator.
    memmap : Create a memory-map to an array stored in a binary file on disk.

    Notes
    -----
    The algorithm works by first finding a "running dimension", along which
    the blocks will be extracted. Dimensions are considered starting from
    the *last* axis: writing the lengths of the axes, last axis first, as
    ``(d1, d2, ..., dn)``, if `buf_size` is smaller than ``d1`` the last
    axis will be used. If, on the other hand,
    ``d1 < buf_size < d1*d2`` the second-to-last axis will be used, and so
    on. Blocks are extracted along this dimension, and when the last block
    is returned the process continues from the next dimension, until all
    elements have been read.

    Indexing an `Arrayterator` returns a new `Arrayterator` restricted to
    the selected region. Integer indices do not remove an axis; they select
    a length-one slice. Negative slice steps are not supported.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
    >>> a_itor = np.lib.arrayterator.Arrayterator(a, 2)
    >>> a_itor.shape
    (3, 4, 5, 6)

    Now we can iterate over ``a_itor``, and it will return arrays of size
    two. Since `buf_size` was smaller than any dimension, the last axis
    will be iterated over first:

    >>> for subarr in a_itor:
    ...     if not subarr.all():
    ...         print(subarr, subarr.shape)
    ...
    [[[[0 1]]]] (1, 1, 1, 2)

    """

    def __init__(self, var, buf_size=None):
        self.var = var
        self.buf_size = buf_size

        # Per-axis bounds of the region of `var` this object represents;
        # initially the whole array with unit step.
        ndim = len(var.shape)
        self.start = [0] * ndim
        self.stop = list(var.shape)
        self.step = [1] * ndim

    def __getattr__(self, attr):
        """
        Delegate unknown attribute lookups to the wrapped object.

        Raises
        ------
        AttributeError
            If `attr` is ``'var'`` (i.e. the instance has not been
            initialised yet) or the wrapped object lacks the attribute.

        """
        # Without this guard a bare instance (no `var` in ``__dict__``)
        # would re-enter ``__getattr__`` through ``self.var`` forever.
        if attr == 'var':
            raise AttributeError(attr)
        return getattr(self.var, attr)

    def __getitem__(self, index):
        """
        Return a new arrayterator restricted to `index`.

        Parameters
        ----------
        index : int, slice, Ellipsis or tuple of these
            Selection relative to the current view. Integers select a
            length-one slice (the axis is kept). Negative integers and
            negative slice bounds count from the end of the axis. Missing
            trailing axes are selected in full.

        Returns
        -------
        out : Arrayterator
            A new instance over the same `var` and `buf_size` whose
            ``start``/``stop``/``step`` describe the selected region.

        Raises
        ------
        ValueError
            If a slice has a zero or negative step.

        """
        if not isinstance(index, tuple):
            index = (index,)

        shape = self.shape
        dims = len(shape)
        length = len(index)

        fixed = []
        for item in index:
            if item is Ellipsis:
                fixed.extend([slice(None)] * (dims - length + 1))
                length = len(fixed)
            elif isinstance(item, numbers.Integral):
                item = int(item)
                # ``slice(-1, 0)`` would be empty, so the last element
                # needs an open-ended stop.
                upper = None if item == -1 else item + 1
                fixed.append(slice(item, upper, 1))
            else:
                fixed.append(item)
        if len(fixed) < dims:
            fixed.extend([slice(None)] * (dims - len(fixed)))

        out = self.__class__(self.var, self.buf_size)
        for i, (start, stop, step, slice_, size) in enumerate(
                zip(self.start, self.stop, self.step, fixed, shape)):
            # Normalise the slice against the length of this view's axis
            # (handles None, negative and out-of-range bounds; raises
            # ValueError for a zero step).
            first, last, stride = slice_.indices(size)
            if stride < 0:
                raise ValueError(
                    "Arrayterator does not support negative slice steps")
            # `first`/`last` count elements of the current view, so they
            # are scaled by the view's own step to index into `var`.
            out.start[i] = start + first * step
            out.step[i] = step * stride
            out.stop[i] = min(stop, start + last * step)
        return out

    def __array__(self, dtype=None, copy=None):
        """
        Return corresponding data.

        Parameters
        ----------
        dtype, copy : optional
            Accepted for compatibility with the NumPy ``__array__`` calling
            convention. They are not applied here; the result is whatever
            indexing `var` returns.

        Returns
        -------
        data
            ``var`` indexed with the slices described by
            ``start``/``stop``/``step``.

        """
        slice_ = tuple(slice(*t) for t in zip(
            self.start, self.stop, self.step))
        return self.var[slice_]

    @property
    def flat(self):
        """
        A 1-D flat iterator for Arrayterator objects.

        This iterator returns elements of the array to be iterated over in
        `Arrayterator` one by one. It is similar to `flatiter`.

        See Also
        --------
        Arrayterator
        flatiter

        Examples
        --------
        >>> a = np.arange(3 * 4 * 5 * 6).reshape(3, 4, 5, 6)
        >>> a_itor = np.lib.arrayterator.Arrayterator(a, 2)

        >>> for subarr in a_itor.flat:
        ...     if not subarr:
        ...         print(subarr, type(subarr))
        ...
        0 <class 'numpy.int64'>

        """
        for block in self:
            for value in block.flat:
                yield value

    @property
    def shape(self):
        """
        The shape of the array to be iterated over.

        Empty selections are reported with an extent of 0.

        For an example, see `Arrayterator`.

        """
        return tuple(max(0, (stop - start - 1) // step + 1)
                     for start, stop, step in
                     zip(self.start, self.stop, self.step))

    def __iter__(self):
        """
        Iterate over the selected region in blocks.

        Yields
        ------
        block
            ``var`` indexed with a tuple of slices. When `buf_size` is set,
            each block selects at most `buf_size` elements.

        """
        shape = self.shape

        # Skip arrays with degenerate dimensions. A plain ``return`` ends
        # the generator; raising StopIteration here would be turned into a
        # RuntimeError (PEP 479).
        if any(dim <= 0 for dim in shape):
            return

        start = self.start[:]
        stop = self.stop[:]
        step = self.step[:]
        ndims = len(self.var.shape)

        while True:
            count = self.buf_size or reduce(mul, shape)

            # Iterate over each dimension, from the last axis backwards,
            # looking for the running dimension (the one along which the
            # blocks will be built).
            rundim = 0
            for i in range(ndims - 1, -1, -1):
                if count == 0:
                    # The running dimension was already found on a later
                    # axis: take a single index along this one.
                    stop[i] = start[i] + 1
                elif count <= shape[i]:
                    # The remaining budget fits inside this axis, so this
                    # is the running dimension.
                    stop[i] = start[i] + count * step[i]
                    rundim = i
                else:
                    # The whole axis fits in the budget: read it in full.
                    stop[i] = self.stop[i]
                stop[i] = min(self.stop[i], stop[i])
                count = count // shape[i]

            # Yield a block.
            slice_ = tuple(slice(*t) for t in zip(start, stop, step))
            yield self.var[slice_]

            # Advance along the running dimension, carrying into earlier
            # axes whenever an axis runs past its end.
            start[rundim] = stop[rundim]
            for i in range(ndims - 1, 0, -1):
                if start[i] >= self.stop[i]:
                    start[i] = self.start[i]
                    start[i - 1] += self.step[i - 1]
            if start[0] >= self.stop[0]:
                return
|
|