|
""" |
|
Collection of utilities to manipulate structured arrays. |
|
|
|
Most of these functions were initially implemented by John Hunter for |
|
matplotlib. They have been rewritten and extended for convenience. |
|
|
|
""" |
|
from __future__ import division, absolute_import, print_function |
|
|
|
import sys |
|
import itertools |
|
import numpy as np |
|
import numpy.ma as ma |
|
from numpy import ndarray, recarray |
|
from numpy.ma import MaskedArray |
|
from numpy.ma.mrecords import MaskedRecords |
|
from numpy.lib._iotools import _is_string_like |
|
from numpy.compat import basestring |
|
|
|
# Python 2 only: replace the builtin ``zip`` with the one provided by
# ``future_builtins``.
if sys.version_info < (3,):
    from future_builtins import zip

# Module-level alias for the private fill-value helper of ``numpy.ma``.
_check_fill_value = np.ma.core._check_fill_value


# Names exported by ``from <this module> import *``.
__all__ = [
    'append_fields',
    'drop_fields',
    'find_duplicates',
    'get_fieldstructure',
    'join_by',
    'merge_arrays',
    'rec_append_fields',
    'rec_drop_fields',
    'rec_join',
    'recursive_fill_fields',
    'rename_fields',
    'stack_arrays',
]
|
|
|
|
|
def recursive_fill_fields(input, output):
    """
    Copy the fields of `input` into the same-named fields of `output`.

    Structured (nested) fields are processed recursively.

    Parameters
    ----------
    input : ndarray
        Structured array providing the values.
    output : ndarray
        Structured array that is filled in place.

    Returns
    -------
    output : ndarray
        The `output` array itself, after filling.

    Notes
    -----
    * A field of `output` whose lookup in `input` raises ``ValueError``
      is skipped and left untouched.
    * Only the first ``len(input)`` items of each field are written, so
      `output` should be at least as long as `input`.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', int), ('B', float)])
    >>> b = np.zeros((3,), dtype=a.dtype)
    >>> rfn.recursive_fill_fields(a, b)
    array([(1, 10.0), (2, 20.0), (0, 0.0)],
          dtype=[('A', '<i4'), ('B', '<f8')])

    """
    for fieldname in output.dtype.names:
        try:
            source = input[fieldname]
        except ValueError:
            # `input` does not provide this field: nothing to copy.
            continue
        if source.dtype.names:
            # Structured field: descend one level.
            recursive_fill_fields(source, output[fieldname])
        else:
            output[fieldname][:len(source)] = source
    return output
|
|
|
|
|
def get_names(adtype):
    """
    Returns the field names of the input datatype as a tuple.

    Nested structures are reported as ``(name, (subnames...))`` pairs.

    Parameters
    ----------
    adtype : dtype
        Input datatype

    Returns
    -------
    names : tuple or None
        The (possibly nested) field names, or None when `adtype` has no
        fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names(np.empty((1,), dtype=int).dtype) is None
    True
    >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]).dtype)
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names(adtype)
    ('a', ('b', ('ba', 'bb')))
    """
    names = adtype.names
    if names is None:
        # Unstructured datatype: there is nothing to iterate over.
        return None
    listnames = []
    for name in names:
        current = adtype[name]
        if current.names:
            # Structured field: pair its name with the names it contains.
            listnames.append((name, tuple(get_names(current))))
        else:
            listnames.append(name)
    return tuple(listnames) or None
|
|
|
|
|
def get_names_flat(adtype):
    """
    Returns the field names of the input datatype as a tuple. Nested
    structures are flattened beforehand.

    The name of a structured field is listed first, immediately followed
    by the names of the fields it contains.

    Parameters
    ----------
    adtype : dtype
        Input datatype

    Returns
    -------
    names : tuple or None
        The flat sequence of field names, or None when `adtype` has no
        fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names_flat(np.empty((1,), dtype=int).dtype) is None
    True
    >>> rfn.get_names_flat(
    ...     np.empty((1,), dtype=[('A',int), ('B', float)]).dtype)
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names_flat(adtype)
    ('a', 'b', 'ba', 'bb')
    """
    names = adtype.names
    if names is None:
        # Unstructured datatype: there is nothing to iterate over.
        return None
    listnames = []
    for name in names:
        listnames.append(name)
        current = adtype[name]
        if current.names:
            listnames.extend(get_names_flat(current))
    return tuple(listnames) or None
|
|
|
|
|
def flatten_descr(ndtype):
    """
    Flatten a structured data-type description.

    Parameters
    ----------
    ndtype : dtype
        Input datatype.

    Returns
    -------
    descr : tuple or list
        For a datatype with fields, a tuple of ``(name, dtype)`` pairs, one
        per non-structured field, nested fields being replaced by the
        fields they contain. For a datatype without fields, ``ndtype.descr``
        is returned unchanged.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.flatten_descr(ndtype)
    (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))

    """
    names = ndtype.names
    if names is None:
        return ndtype.descr
    descr = []
    for field in names:
        # A `fields` entry is (dtype, offset) or, for a titled field,
        # (dtype, offset, title): only the leading dtype is needed.
        typ = ndtype.fields[field][0]
        if typ.names:
            descr.extend(flatten_descr(typ))
        else:
            descr.append((field, typ))
    return tuple(descr)
|
|
|
|
|
def zip_descr(seqarrays, flatten=False):
    """
    Build the combined dtype description of a sequence of arrays.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Arrays whose datatypes are combined, in order.
    flatten : {boolean}, optional
        If True, each datatype is first passed through `flatten_descr`.
        If False, a datatype with more than one field is kept as a single
        (unnamed) nested entry, any other datatype contributes its
        ``descr`` entries directly.

    Returns
    -------
    descr : list
        The ``descr`` of the datatype built from the collected entries.

    """
    collected = []
    for arr in seqarrays:
        if flatten:
            collected.extend(flatten_descr(arr.dtype))
            continue
        adtype = arr.dtype
        if len(adtype.names or ()) > 1:
            # Several fields: keep them grouped under one nested entry.
            collected.append(('', adtype.descr))
        else:
            collected.extend(adtype.descr)
    return np.dtype(collected).descr
|
|
|
|
|
def get_fieldstructure(adtype, lastname=None, parents=None,):
    """
    Returns a dictionary with fields indexing lists of their parent fields.

    This function is used to simplify access to fields nested in other
    fields: each value lists, outermost first, the fields that must be
    traversed to reach the key.

    Parameters
    ----------
    adtype : np.dtype
        Input datatype
    lastname : optional
        Last processed field name (used internally during recursion).
    parents : dictionary
        Dictionary of parent fields (used internally during recursion).
        It is updated in place.

    Returns
    -------
    parents : dict or None
        Mapping of each field name to the list of its ancestors, or None
        if the mapping is empty.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype =  np.dtype([('A', int),
    ...                     ('B', [('BA', int),
    ...                            ('BB', [('BBA', int), ('BBB', int)])])])
    >>> rfn.get_fieldstructure(ndtype)
    {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}

    """
    if parents is None:
        parents = {}
    # Ancestry shared by every field of `adtype`: the ancestors of the
    # enclosing field followed by the enclosing field itself.
    if lastname:
        ancestry = list(parents.get(lastname, [])) + [lastname]
    else:
        ancestry = []
    for name in adtype.names:
        current = adtype[name]
        # Store a copy so that entries never share the same list.
        parents[name] = list(ancestry)
        if current.names:
            # Structured field: register its own fields (in place).
            get_fieldstructure(current, name, parents)
    return parents or None
|
|
|
|
|
def _izip_fields_flat(iterable): |
|
""" |
|
Returns an iterator of concatenated fields from a sequence of arrays, |
|
collapsing any nested structure. |
|
|
|
""" |
|
for element in iterable: |
|
if isinstance(element, np.void): |
|
for f in _izip_fields_flat(tuple(element)): |
|
yield f |
|
else: |
|
yield element |
|
|
|
|
|
def _izip_fields(iterable): |
|
""" |
|
Returns an iterator of concatenated fields from a sequence of arrays. |
|
|
|
""" |
|
for element in iterable: |
|
if (hasattr(element, '__iter__') and |
|
not isinstance(element, basestring)): |
|
for f in _izip_fields(element): |
|
yield f |
|
elif isinstance(element, np.void) and len(tuple(element)) == 1: |
|
for f in _izip_fields(element): |
|
yield f |
|
else: |
|
yield element |
|
|
|
|
|
def izip_records(seqarrays, fill_value=None, flatten=True):
    """
    Returns an iterator of concatenated items from a sequence of arrays.

    The inputs are iterated in parallel; an input that is exhausted is
    padded with `fill_value` for as long as another input still has items.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays (or iterables).
    fill_value : {None, integer}
        Value used to pad shorter iterables.
    flatten : {True, False},
        If True, each group of items is processed by `_izip_fields_flat`,
        otherwise by `_izip_fields`.

    Yields
    ------
    record : tuple
        The processed items taken from each input at the current position.

    """
    # One entry per input but the last. Each exhausted input pops one
    # entry; when the last input runs out the list is empty and ``pop``
    # raises the IndexError that ends the iteration below.
    remaining = [fill_value] * (len(seqarrays) - 1)

    def sentinel(counter=remaining.pop):
        "Yields the fill_value or raises IndexError"
        yield counter()

    padding = itertools.repeat(fill_value)
    padded = [itertools.chain(seq, sentinel(), padding) for seq in seqarrays]

    zipfunc = _izip_fields_flat if flatten else _izip_fields

    try:
        for group in zip(*padded):
            yield tuple(zipfunc(group))
    except IndexError:
        # Every input has been exhausted.
        pass
|
|
|
|
|
def _fix_output(output, usemask=True, asrecarray=False): |
|
""" |
|
Private function: return a recarray, a ndarray, a MaskedArray |
|
or a MaskedRecords depending on the input parameters |
|
""" |
|
if not isinstance(output, MaskedArray): |
|
usemask = False |
|
if usemask: |
|
if asrecarray: |
|
output = output.view(MaskedRecords) |
|
else: |
|
output = ma.filled(output) |
|
if asrecarray: |
|
output = output.view(recarray) |
|
return output |
|
|
|
|
|
def _fix_defaults(output, defaults=None): |
|
""" |
|
Update the fill_value and masked data of `output` |
|
from the default given in a dictionary defaults. |
|
""" |
|
names = output.dtype.names |
|
(data, mask, fill_value) = (output.data, output.mask, output.fill_value) |
|
for (k, v) in (defaults or {}).items(): |
|
if k in names: |
|
fill_value[k] = v |
|
data[k][mask[k]] = v |
|
return output |
|
|
|
|
|
def merge_arrays(seqarrays, fill_value=-1, flatten=False,
                 usemask=False, asrecarray=False):
    """
    Merge arrays field by field.

    Parameters
    ----------
    seqarrays : sequence of ndarrays
        Sequence of arrays. A single array, or a sequence of length one,
        is also accepted.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    flatten : {False, True}, optional
        Whether to collapse nested fields.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Returns
    -------
    output : ndarray, recarray, MaskedArray or MaskedRecords
        Structured array as long as the longest input.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])))
    array([(1, 10.0), (2, 20.0), (-1, 30.0)],
          dtype=[('f0', '<i4'), ('f1', '<f8')])
    >>> rfn.merge_arrays((np.array([1, 2]).view([('a', int)]),
    ...                   np.array([10., 20., 30.])),
    ...                  usemask=False, asrecarray=True)
    rec.array([(1, 10.0), (2, 20.0), (-1, 30.0)],
              dtype=[('a', '<i4'), ('f1', '<f8')])

    Notes
    -----
    * Without a mask, the entries missing from the shorter arrays hold a
      value derived from `fill_value` and the datatype of the array, as
      computed by ``numpy.ma.core._check_fill_value``.

    """
    # A sequence holding a single item is reduced to that item.
    if len(seqarrays) == 1:
        seqarrays = np.asanyarray(seqarrays[0])

    if isinstance(seqarrays, (ndarray, np.void)):
        # A single array was given.
        single_dtype = seqarrays.dtype
        view_only = (not flatten) or (
            zip_descr((seqarrays,), flatten=True) == single_dtype.descr)
        if view_only:
            # Nothing to merge nor to flatten: a view is enough.
            flat_input = seqarrays.ravel()
            # Give a field to an input without any.
            if not single_dtype.names:
                single_dtype = [('', single_dtype)]
            # Class of the returned view.
            if usemask:
                view_type = MaskedRecords if asrecarray else MaskedArray
            else:
                view_type = recarray if asrecarray else ndarray
            return flat_input.view(dtype=single_dtype, type=view_type)
        seqarrays = (seqarrays,)
    else:
        # Make sure the sequence only holds arrays.
        seqarrays = [np.asanyarray(item) for item in seqarrays]

    # Sizes of the inputs, and length of the output.
    sizes = tuple(arr.size for arr in seqarrays)
    maxlength = max(sizes)

    # Datatype of the output (flattened if requested).
    newdtype = zip_descr(seqarrays, flatten=flatten)

    padded_data = []
    padded_mask = []

    if usemask:
        # Masked output: the masks are padded along with the data.
        for (arr, size) in zip(seqarrays, sizes):
            nbmissing = maxlength - size
            values = arr.ravel().__array__()
            flags = ma.getmaskarray(arr).ravel()
            # Padding value and padding mask (only if padding is needed).
            if nbmissing:
                fval = _check_fill_value(fill_value, arr.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        fval = fval.item()[0]
                        fmsk = True
                    else:
                        fval = np.array(fval, dtype=arr.dtype, ndmin=1)
                        fmsk = np.ones((1,), dtype=flags.dtype)
            else:
                fval = None
                fmsk = True
            # Iterators over the input extended to the output length.
            padded_data.append(itertools.chain(values, [fval] * nbmissing))
            padded_mask.append(itertools.chain(flags, [fmsk] * nbmissing))

        records = tuple(izip_records(padded_data, flatten=flatten))
        output = ma.array(
            np.fromiter(records, dtype=newdtype, count=maxlength),
            mask=list(izip_records(padded_mask, flatten=flatten)))
        if asrecarray:
            output = output.view(MaskedRecords)
        return output

    # Unmasked output: only the data have to be padded.
    for (arr, size) in zip(seqarrays, sizes):
        nbmissing = maxlength - size
        values = arr.ravel().__array__()
        if nbmissing:
            fval = _check_fill_value(fill_value, arr.dtype)
            if isinstance(fval, (ndarray, np.void)):
                if len(fval.dtype) == 1:
                    fval = fval.item()[0]
                else:
                    fval = np.array(fval, dtype=arr.dtype, ndmin=1)
        else:
            fval = None
        padded_data.append(itertools.chain(values, [fval] * nbmissing))

    records = tuple(izip_records(padded_data, flatten=flatten))
    output = np.fromiter(records, dtype=newdtype, count=maxlength)
    if asrecarray:
        output = output.view(recarray)
    return output
|
|
|
|
|
def drop_fields(base, drop_names, usemask=True, asrecarray=False):
    """
    Return a new array with fields in `drop_names` dropped.

    Nested fields are supported. A structured field left without any
    field is dropped as well.

    Parameters
    ----------
    base : array
        Input array
    drop_names : string or sequence
        String or sequence of strings corresponding to the names of the
        fields to drop.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray or a mrecarray (`asrecarray=True`) or
        a plain ndarray or masked array with flexible dtype. The default
        is False.

    Returns
    -------
    output : array or None
        The new array, or None if no field is left.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
    ...   dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
    >>> rfn.drop_fields(a, 'a')
    array([((2.0, 3),), ((5.0, 6),)],
          dtype=[('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.drop_fields(a, 'ba')
    array([(1, (3,)), (4, (6,))],
          dtype=[('a', '<i4'), ('b', [('bb', '<i4')])])
    >>> rfn.drop_fields(a, ['ba', 'bb'])
    array([(1,), (4,)],
          dtype=[('a', '<i4')])
    """
    if _is_string_like(drop_names):
        drop_names = [drop_names, ]
    else:
        drop_names = set(drop_names)

    def _kept_descr(ndtype):
        # Description of `ndtype` restricted to the fields that are kept.
        kept = []
        for fieldname in ndtype.names:
            if fieldname in drop_names:
                continue
            fieldtype = ndtype[fieldname]
            if not fieldtype.names:
                kept.append((fieldname, fieldtype))
                continue
            nested = _kept_descr(fieldtype)
            # A structured field without any field left is not kept.
            if nested:
                kept.append((fieldname, nested))
        return kept

    newdtype = _kept_descr(base.dtype)
    if not newdtype:
        return None

    output = recursive_fill_fields(base, np.empty(base.shape, dtype=newdtype))
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
|
|
|
|
|
def rec_drop_fields(base, drop_names):
    """
    Returns a new numpy.recarray with fields in `drop_names` dropped.

    Shortcut for `drop_fields` called with ``usemask=False`` and
    ``asrecarray=True``.

    See Also
    --------
    drop_fields
    """
    options = dict(usemask=False, asrecarray=True)
    return drop_fields(base, drop_names, **options)
|
|
|
|
|
def rename_fields(base, namemapper):
    """
    Rename the fields from a flexible-datatype ndarray or recarray.

    Nested fields are supported. The result is a view of `base` with the
    renamed datatype.

    Parameters
    ----------
    base : ndarray
        Input array whose fields must be modified.
    namemapper : dictionary
        Dictionary mapping old field names to their new version. Fields
        missing from the dictionary keep their name.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
    ...   dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
    >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
    array([(1, (2.0, [3.0, 30.0])), (4, (5.0, [6.0, 60.0]))],
          dtype=[('A', '<i4'), ('b', [('ba', '<f8'), ('BB', '<f8', 2)])])

    """
    def _renamed_descr(ndtype):
        # Description of `ndtype` with the new names, level by level.
        renamed = []
        for oldname in ndtype.names:
            fieldtype = ndtype[oldname]
            if fieldtype.names:
                fieldtype = _renamed_descr(fieldtype)
            renamed.append((namemapper.get(oldname, oldname), fieldtype))
        return renamed

    return base.view(_renamed_descr(base.dtype))
|
|
|
|
|
def append_fields(base, names, data, dtypes=None,
                  fill_value=-1, usemask=True, asrecarray=False):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.
        A single datatype is applied to every item of `data`.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Raises
    ------
    ValueError
        If `names` is a list or tuple whose length differs from the length
        of `data`, or if several `dtypes` are given and their number
        differs from the number of items in `data`.

    """
    # Check the names
    if isinstance(names, (tuple, list)):
        if len(names) != len(data):
            msg = "The number of arrays does not match the number of names"
            raise ValueError(msg)
    elif isinstance(names, basestring):
        names = [names, ]
        data = [data, ]

    # Turn every new field into a one-field structured array.
    # `np.asanyarray` converts without forcing a copy and keeps subclasses;
    # unlike `np.array(..., copy=False)` it does not raise on NumPy
    # versions where ``copy=False`` means "never copy".
    if dtypes is None:
        data = [np.asanyarray(a) for a in data]
        data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
    else:
        if not isinstance(dtypes, (tuple, list)):
            dtypes = [dtypes, ]
        if len(data) != len(dtypes):
            if len(dtypes) == 1:
                # A single datatype is shared by all the new fields.
                dtypes = dtypes * len(data)
            else:
                msg = "The dtypes argument must be None, a dtype, or a list."
                raise ValueError(msg)
        data = [np.asanyarray(a, dtype=d).view([(n, d)])
                for (a, n, d) in zip(data, names, dtypes)]

    base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
    if len(data) > 1:
        data = merge_arrays(data, flatten=True, usemask=usemask,
                            fill_value=fill_value)
    else:
        data = data.pop()

    # Fully masked output, long enough for both the base and the new
    # fields, filled first from the base then from the new data.
    output = ma.masked_all(max(len(base), len(data)),
                           dtype=base.dtype.descr + data.dtype.descr)
    output = recursive_fill_fields(base, output)
    output = recursive_fill_fields(data, output)

    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
|
|
|
|
|
def rec_append_fields(base, names, data, dtypes=None):
    """
    Add new fields to an existing array and return a recarray.

    Shortcut for `append_fields` called with ``asrecarray=True`` and
    ``usemask=False``. As for `append_fields`, `names`, `data` and `dtypes`
    can be single values when only one field is appended.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.

    See Also
    --------
    append_fields

    Returns
    -------
    appended_array : np.recarray
    """
    options = dict(data=data, dtypes=dtypes, asrecarray=True, usemask=False)
    return append_fields(base, names, **options)
|
|
|
|
|
def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                 autoconvert=False):
    """
    Superposes arrays fields by fields

    Parameters
    ----------
    arrays : array or sequence
        Sequence of input arrays. A single ndarray, or the only item of a
        sequence of length one, is returned as is.
    defaults : dictionary, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.
    autoconvert : {False, True}, optional
        Whether automatically cast the type of the field to the maximum.

    Raises
    ------
    TypeError
        If a field is present in several arrays with different types and
        `autoconvert` is False.

    Notes
    -----
    Only the type part of a field description is compared between arrays:
    for sub-array fields, described as ``(name, type, shape)``, the shapes
    are not compared.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> x = np.array([1, 2,])
    >>> rfn.stack_arrays(x) is x
    True
    >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
    >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
    ...   dtype=[('A', '|S3'), ('B', float), ('C', float)])
    >>> test = rfn.stack_arrays((z,zz))
    >>> test
    masked_array(data = [('A', 1.0, --) ('B', 2.0, --) ('a', 10.0, 100.0) ('b', 20.0, 200.0)
     ('c', 30.0, 300.0)],
                 mask = [(False, False, True) (False, False, True) (False, False, False)
     (False, False, False) (False, False, False)],
           fill_value = ('N/A', 1e+20, 1e+20),
                dtype = [('A', '|S3'), ('B', '<f8'), ('C', '<f8')])

    """
    if isinstance(arrays, ndarray):
        return arrays
    elif len(arrays) == 1:
        return arrays[0]
    seqarrays = [np.asanyarray(a).ravel() for a in arrays]
    nrecords = [len(a) for a in seqarrays]
    ndtype = [a.dtype for a in seqarrays]
    fldnames = [d.names for d in ndtype]

    # Build the output description: start from the first array, then add
    # the fields that only the following arrays provide.
    newdescr = ndtype[0].descr
    names = [entry[0] for entry in newdescr]
    for dtype_n in ndtype[1:]:
        for descr in dtype_n.descr:
            name = descr[0] or ''
            if name not in names:
                newdescr.append(descr)
                names.append(name)
                continue
            nameidx = names.index(name)
            current_descr = newdescr[nameidx]
            # The type is always at index 1; the last item would be the
            # shape for a sub-array entry ``(name, type, shape)``.
            if autoconvert:
                if np.dtype(descr[1]) > np.dtype(current_descr[1]):
                    current_descr = list(current_descr)
                    current_descr[1] = descr[1]
                    newdescr[nameidx] = tuple(current_descr)
            elif descr[1] != current_descr[1]:
                raise TypeError("Incompatible type '%s' <> '%s'" %
                                (current_descr[1], descr[1]))

    # Only one field: use concatenate
    if len(newdescr) == 1:
        output = ma.concatenate(seqarrays)
    else:
        # Fully masked output, filled block by block: the records of each
        # input go to the slice [i:j] of its fields.
        output = ma.masked_all((np.sum(nrecords),), newdescr)
        offset = np.cumsum(np.r_[0, nrecords])
        seen = []
        for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]):
            names = a.dtype.names
            if names is None:
                # Input without fields: use a default 'f<number>' field.
                output['f%i' % len(seen)][i:j] = a
            else:
                for name in n:
                    output[name][i:j] = a[name]
                    if name not in seen:
                        seen.append(name)

    return _fix_output(_fix_defaults(output, defaults),
                       usemask=usemask, asrecarray=asrecarray)
|
|
|
|
|
def find_duplicates(a, key=None, ignoremask=True, return_index=False):
    """
    Find the duplicates in a structured array along a given key

    Parameters
    ----------
    a : array-like
        Input array. The implementation uses the ``filled`` method and the
        ``recordmask`` attribute of the (sorted) data, so a masked array
        is expected.
    key : {string, None}, optional
        Name of the fields along which to check the duplicates.
        If None, the search is performed by records
    ignoremask : {True, False}, optional
        Whether masked data should be discarded or considered as duplicates.
    return_index : {False, True}, optional
        Whether to return the indices of the duplicated values.

    Returns
    -------
    duplicates : array
        The duplicated entries of the flattened input, in sorted order.
    index : ndarray
        Indices of the duplicates in the flattened input. Only returned
        if `return_index` is True.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = [('a', int)]
    >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
    ...         mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
    >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
    ... # XXX: judging by the output, the ignoremask flag has no effect
    """
    a = np.asanyarray(a).ravel()

    # Parents of every field, to reach a key nested in other fields.
    fields = get_fieldstructure(a.dtype)

    base = a
    if key:
        # Walk down the parents of the key, then select the key itself.
        for fieldname in fields[key] + [key]:
            base = base[fieldname]

    # Sort, so that duplicates end up next to each other.
    sortidx = base.argsort()
    sortedbase = base[sortidx]
    sorteddata = sortedbase.filled()

    # flag[i] is set when the sorted items i and i+1 are equal.
    flag = (sorteddata[:-1] == sorteddata[1:])

    if ignoremask:
        # Discard the pairs whose second item is masked.
        flag[sortedbase.recordmask[1:]] = False
    flag = np.concatenate(([False], flag))

    # Flag the first item of each pair as well as the second one.
    flag[:-1] = flag[:-1] + flag[1:]
    duplicates = a[sortidx][flag]
    if return_index:
        return (duplicates, sortidx[flag])
    return duplicates
|
|
|
|
|
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, usemask=True, asrecarray=False):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of string corresponding
    to the fields used to join the array.  An exception is raised if the
    `key` field cannot be found in the two input arrays.  Neither `r1` nor
    `r2` should have any duplicates along `key`: the presence of duplicates
    will make the output quite unreliable. Note that duplicates are not
    looked for by the algorithm.

    Parameters
    ----------
    key : {string, sequence}
        A string or a sequence of strings corresponding to the fields used
        for comparison.
    r1, r2 : arrays
        Structured arrays.
    jointype : {'inner', 'outer', 'leftouter'}, optional
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of r2 not in r1.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    r1postfix : string, optional
        String appended to the names of the fields of r1 that are present
        in r2 but absent of the key.
    r2postfix : string, optional
        String appended to the names of the fields of r2 that are present
        in r1 but absent of the key.
    defaults : {dictionary}, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.

    Raises
    ------
    ValueError
        If `jointype` is not one of the accepted values, if a key field is
        missing from `r1` or `r2`, or if the arrays share non-key field
        names while both `r1postfix` and `r2postfix` are empty.

    Notes
    -----
    * The output is sorted along the key.
    * A temporary array is formed by dropping the fields not in the key for
      the two arrays and concatenating the result. This array is then
      sorted, and the common entries selected. The output is constructed by
      filling the fields with the selected entries. Matching is not
      preserved if there are some duplicates...

    """
    # Check jointype
    if jointype not in ('inner', 'outer', 'leftouter'):
        raise ValueError(
            "The 'jointype' argument should be in 'inner', "
            "'outer' or 'leftouter' (got '%s' instead)" % jointype
        )
    # If we have a single key, put it in a tuple
    if isinstance(key, basestring):
        key = (key,)

    # Check the keys
    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %s' % name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %s' % name)

    # Make sure we work with ravelled arrays
    r1 = r1.ravel()
    r2 = r2.ravel()

    nb1 = len(r1)
    (r1names, r2names) = (r1.dtype.names, r2.dtype.names)

    # Check the names for collision
    if (set.intersection(set(r1names), set(r2names)).difference(key) and
            not (r1postfix or r2postfix)):
        msg = "r1 and r2 contain common names, r1postfix and r2postfix "
        msg += "can't be empty"
        raise ValueError(msg)

    # Make temporary arrays of just the keys
    r1k = drop_fields(r1, [n for n in r1names if n not in key])
    r2k = drop_fields(r2, [n for n in r2names if n not in key])

    # Concatenate the two arrays for comparison
    aux = ma.concatenate((r1k, r2k))
    idx_sort = aux.argsort(order=key)
    aux = aux[idx_sort]

    # Get the common keys: after sorting, equal keys are adjacent, and both
    # members of each equal pair are flagged.
    flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
    flag_in[:-1] = flag_in[1:] + flag_in[:-1]
    idx_in = idx_sort[flag_in]
    # Indices below nb1 refer to r1, the others (shifted by nb1) to r2.
    idx_1 = idx_in[(idx_in < nb1)]
    idx_2 = idx_in[(idx_in >= nb1)] - nb1
    (r1cmn, r2cmn) = (len(idx_1), len(idx_2))
    if jointype == 'inner':
        (r1spc, r2spc) = (0, 0)
    elif jointype == 'outer':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn)
    elif jointype == 'leftouter':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, 0)
    # Select the entries from each input
    (s1, s2) = (r1[idx_1], r2[idx_2])

    # Build the new description of the output array.......
    # Start with the key fields
    ndtype = [list(_) for _ in r1k.dtype.descr]
    # Add the other fields of r1
    ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key)
    # Add the fields of r2
    for desc in r2.dtype.descr:
        desc = list(desc)
        name = desc[0]
        # Rebuild the list of names at each step: the insertions and the
        # postfixed names below change both the names and their positions.
        names = [entry[0] for entry in ndtype]
        if name in names:
            # Look the field up by name: the colliding descriptions may
            # differ by their type.
            nameidx = names.index(name)
            current = ndtype[nameidx]
            if name in key:
                # The current field is part of the key: take the largest
                # type (index 1 is the type, a shape may follow it).
                current[1] = max(desc[1], current[1])
            else:
                # The current field is not part of the key: add the
                # suffixes and store the r2 field right after the r1 one.
                current[0] += r1postfix
                desc[0] += r2postfix
                ndtype.insert(nameidx + 1, desc)
        else:
            # New field: just add its description to the list.
            ndtype.append(desc)
    # Rebuild a dtype from the new fields
    ndtype = [tuple(_) for _ in ndtype]

    # Find the largest nb of common fields :
    # r1cmn and r2cmn should be equal, but...
    cmn = max(r1cmn, r2cmn)
    # Construct an empty array
    output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
    names = output.dtype.names
    for f in r1names:
        selected = s1[f]
        if f not in names or (f in r2names and not r2postfix and f not in key):
            f += r1postfix
        current = output[f]
        current[:r1cmn] = selected[:r1cmn]
        if jointype in ('outer', 'leftouter'):
            # The entries specific to r1 follow the common ones.
            current[cmn:cmn + r1spc] = selected[r1cmn:]
    for f in r2names:
        selected = s2[f]
        if f not in names or (f in r1names and not r1postfix and f not in key):
            f += r2postfix
        current = output[f]
        current[:r2cmn] = selected[:r2cmn]
        if (jointype == 'outer') and r2spc:
            # The entries specific to r2 are stored at the end.
            current[-r2spc:] = selected[r2cmn:]
    # Sort and finalize the output
    output.sort(order=key)
    kwargs = dict(usemask=usemask, asrecarray=asrecarray)
    return _fix_output(_fix_defaults(output, defaults), **kwargs)
|
|
|
|
|
def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
             defaults=None):
    """
    Join arrays `r1` and `r2` on keys.
    Alternative to join_by, that always returns a np.recarray.

    All the arguments are passed on to `join_by`, which is called with
    ``usemask=False`` and ``asrecarray=True``.

    See Also
    --------
    join_by : equivalent function
    """
    return join_by(key, r1, r2,
                   jointype=jointype,
                   r1postfix=r1postfix,
                   r2postfix=r2postfix,
                   defaults=defaults,
                   usemask=False,
                   asrecarray=True)
|
|