tmp
/
pip-install-ghxuqwgs
/numpy_78e94bf2b6094bf9a1f3d92042f9bf46
/build
/lib.linux-x86_64-cpython-310
/numpy
/lib
/recfunctions.py
""" | |
Collection of utilities to manipulate structured arrays. | |
Most of these functions were initially implemented by John Hunter for | |
matplotlib. They have been rewritten and extended for convenience. | |
""" | |
from __future__ import division, absolute_import, print_function | |
import sys | |
import itertools | |
import numpy as np | |
import numpy.ma as ma | |
from numpy import ndarray, recarray | |
from numpy.ma import MaskedArray | |
from numpy.ma.mrecords import MaskedRecords | |
from numpy.lib._iotools import _is_string_like | |
from numpy.compat import basestring | |
if sys.version_info[0] < 3: | |
from future_builtins import zip | |
_check_fill_value = np.ma.core._check_fill_value | |
__all__ = [ | |
'append_fields', 'drop_fields', 'find_duplicates', | |
'get_fieldstructure', 'join_by', 'merge_arrays', | |
'rec_append_fields', 'rec_drop_fields', 'rec_join', | |
'recursive_fill_fields', 'rename_fields', 'stack_arrays', | |
] | |
def recursive_fill_fields(input, output):
    """
    Copy the fields of `input` into the same-named fields of `output`.

    Nested structured fields are handled by recursing into them.  A field of
    `output` whose lookup on `input` raises ``ValueError`` is skipped.

    Parameters
    ----------
    input : ndarray
        Structured array providing the values.
    output : ndarray
        Structured array receiving the values; it is modified in place.

    Returns
    -------
    output : ndarray
        The very object passed as `output`.

    Notes
    -----
    * `output` should be at least the same size as `input`: only the leading
      ``len(input)`` entries of each field are overwritten.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', int), ('B', float)])
    >>> b = np.zeros((3,), dtype=a.dtype)
    >>> rfn.recursive_fill_fields(a, b).tolist()
    [(1, 10.0), (2, 20.0), (0, 0.0)]

    """
    for name in output.dtype.names:
        try:
            source = input[name]
        except ValueError:
            # `input` has no such field: leave the output field as it is.
            continue
        if source.dtype.names:
            # Structured sub-field: fill it field by field.
            recursive_fill_fields(source, output[name])
        else:
            output[name][:len(source)] = source
    return output
def get_names(adtype):
    """
    Returns the field names of the input datatype as a tuple.

    Nested structured fields are reported as ``(name, (subnames...))`` pairs.

    Parameters
    ----------
    adtype : dtype
        Input datatype.  An array or numpy scalar is also accepted, in which
        case its ``dtype`` attribute is used.

    Returns
    -------
    names : tuple or None
        The (possibly nested) field names, or None when the datatype has
        no fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names(np.empty((1,), dtype=int)) is None
    True
    >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]))
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names(adtype)
    ('a', ('b', ('ba', 'bb')))

    """
    # Arrays and scalars carry their datatype in `.dtype`.
    if isinstance(adtype, (np.ndarray, np.generic)):
        adtype = adtype.dtype
    names = adtype.names
    if names is None:
        # Unstructured datatype: there is nothing to iterate over.
        return None
    listnames = []
    for name in names:
        current = adtype[name]
        if current.names:
            listnames.append((name, tuple(get_names(current))))
        else:
            listnames.append(name)
    return tuple(listnames) or None
def get_names_flat(adtype):
    """
    Returns the field names of the input datatype as a flat tuple.

    Nested structures are flattened: the name of a structured field is
    immediately followed by the names of its sub-fields.

    Parameters
    ----------
    adtype : dtype
        Input datatype.  An array or numpy scalar is also accepted, in which
        case its ``dtype`` attribute is used.

    Returns
    -------
    names : tuple or None
        The flattened field names, or None when the datatype has no fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names_flat(np.empty((1,), dtype=int)) is None
    True
    >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', float)]))
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names_flat(adtype)
    ('a', 'b', 'ba', 'bb')

    """
    # Arrays and scalars carry their datatype in `.dtype`.
    if isinstance(adtype, (np.ndarray, np.generic)):
        adtype = adtype.dtype
    names = adtype.names
    if names is None:
        # Unstructured datatype: there is nothing to iterate over.
        return None
    listnames = []
    for name in names:
        listnames.append(name)
        current = adtype[name]
        if current.names:
            listnames.extend(get_names_flat(current))
    return tuple(listnames) or None
def flatten_descr(ndtype):
    """
    Flatten a structured data-type description.

    Parameters
    ----------
    ndtype : dtype
        Datatype to flatten.

    Returns
    -------
    descr : tuple or list
        For a structured datatype, a tuple of ``(name, dtype)`` pairs for the
        leaf fields, nested fields being expanded in place.  For a datatype
        without fields, ``ndtype.descr`` is returned unchanged.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.flatten_descr(ndtype)
    (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))

    """
    names = ndtype.names
    if names is None:
        return ndtype.descr
    descr = []
    for field in names:
        # A `fields` entry is (dtype, offset) or (dtype, offset, title):
        # take the dtype by position so that titled fields are accepted.
        typ = ndtype.fields[field][0]
        if typ.names:
            descr.extend(flatten_descr(typ))
        else:
            descr.append((field, typ))
    return tuple(descr)
def zip_descr(seqarrays, flatten=False):
    """
    Combine the dtype description of a series of arrays.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays
    flatten : {boolean}, optional
        Whether to collapse nested descriptions.

    Returns
    -------
    descr : list
        The ``descr`` of the datatype built from the combined descriptions.

    """
    combined = []
    for arr in seqarrays:
        adtype = arr.dtype
        if flatten:
            combined.extend(flatten_descr(adtype))
        elif len(adtype.names or ()) > 1:
            # Several fields: keep them grouped under one unnamed field.
            combined.append(('', adtype.descr))
        else:
            # Unstructured or single-field: splice the description in.
            combined.extend(adtype.descr)
    return np.dtype(combined).descr
def get_fieldstructure(adtype, lastname=None, parents=None,):
    """
    Returns a dictionary with fields indexing lists of their parent fields.

    This function is used to simplify access to fields nested in other fields.

    Parameters
    ----------
    adtype : np.dtype
        Input datatype
    lastname : optional
        Last processed field name (used internally during recursion).
    parents : dictionary
        Dictionary of parent fields (used internally during recursion).
        It is updated in place.

    Returns
    -------
    parents : dict or None
        Maps every field name, at any nesting depth, to the list of its
        ancestors ordered from the outermost one.  None is returned when no
        field was found (e.g. for an unstructured datatype).

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('A', int),
    ...                    ('B', [('BA', int),
    ...                           ('BB', [('BBA', int), ('BBB', int)])])])
    >>> rfn.get_fieldstructure(ndtype) == {
    ...     'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'],
    ...     'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}
    True

    """
    if parents is None:
        parents = {}
    # Every field of `adtype` shares the same ancestors: those of the
    # enclosing field followed by the enclosing field itself.  Building the
    # chain once keeps the full path for structured fields too.
    if lastname:
        ancestors = list(parents.get(lastname, []) or [])
        ancestors.append(lastname)
    else:
        ancestors = []
    for name in adtype.names or ():
        parents[name] = list(ancestors)
        current = adtype[name]
        if current.names:
            # `parents` is filled in place by the recursive call.
            get_fieldstructure(current, name, parents)
    return parents or None
def _izip_fields_flat(iterable):
    """
    Returns an iterator of concatenated fields from a sequence of arrays,
    collapsing any nested structure.

    Every ``np.void`` item is replaced, recursively, by its individual
    values; any other item is yielded unchanged.

    """
    for item in iterable:
        if not isinstance(item, np.void):
            yield item
            continue
        # Structured record: expand it into its (possibly nested) values.
        for leaf in _izip_fields_flat(tuple(item)):
            yield leaf
def _izip_fields(iterable):
    """
    Returns an iterator of concatenated fields from a sequence of arrays.

    Iterable items are expanded recursively, as are ``np.void`` records
    holding a single value.  Text and byte strings are always yielded whole.

    """
    for element in iterable:
        # `bytes` is listed explicitly: on Python 3 it is iterable but is not
        # a `basestring`, so byte strings would otherwise be split into ints.
        if (hasattr(element, '__iter__') and
                not isinstance(element, (basestring, bytes))):
            for f in _izip_fields(element):
                yield f
        elif isinstance(element, np.void) and len(tuple(element)) == 1:
            for f in _izip_fields(element):
                yield f
        else:
            yield element
def izip_records(seqarrays, fill_value=None, flatten=True):
    """
    Returns an iterator of concatenated items from a sequence of arrays.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays.
    fill_value : {None, integer}
        Value used to pad shorter iterables.
    flatten : {True, False},
        Whether the items of each record are combined with
        `_izip_fields_flat` (True) or with `_izip_fields` (False).

    """
    # Same scheme as the Python 2.6 `itertools.izip_longest` recipe: each
    # exhausted input pops one token, and the pop that fails (IndexError)
    # signals that the last input has run out too.
    tokens = [fill_value] * (len(seqarrays) - 1)

    def _exhausted(take=tokens.pop):
        "Yields the fill_value or raises IndexError"
        yield take()

    padding = itertools.repeat(fill_value)
    padded = [itertools.chain(seq, _exhausted(), padding)
              for seq in seqarrays]
    combine = _izip_fields_flat if flatten else _izip_fields
    try:
        for record in zip(*padded):
            yield tuple(combine(record))
    except IndexError:
        return
def _fix_output(output, usemask=True, asrecarray=False):
    """
    Private function: return a recarray, a ndarray, a MaskedArray
    or a MaskedRecords depending on the input parameters

    The mask is kept only when `usemask` is set and `output` is a
    MaskedArray; otherwise `output` goes through ``ma.filled``.

    """
    keep_mask = usemask and isinstance(output, MaskedArray)
    if keep_mask:
        return output.view(MaskedRecords) if asrecarray else output
    filled = ma.filled(output)
    return filled.view(recarray) if asrecarray else filled
def _fix_defaults(output, defaults=None):
    """
    Update the fill_value and masked data of `output`
    from the default given in a dictionary defaults.

    Parameters
    ----------
    output : MaskedArray
        Array to update in place.
    defaults : dictionary, optional
        Maps field names to default values.  Names that are not fields of
        `output` are ignored.

    Returns
    -------
    output : MaskedArray
        The very object passed as `output`.

    """
    # An unstructured array has `names is None`: treat it as having no
    # fields so that every default is simply ignored.
    names = output.dtype.names or ()
    (data, mask, fill_value) = (output.data, output.mask, output.fill_value)
    for (k, v) in (defaults or {}).items():
        if k in names:
            fill_value[k] = v
            # Overwrite the data hidden behind the mask with the default.
            data[k][mask[k]] = v
    return output
def merge_arrays(seqarrays, fill_value=-1, flatten=False,
                 usemask=False, asrecarray=False):
    """
    Merge arrays field by field.

    Parameters
    ----------
    seqarrays : sequence of ndarrays
        Sequence of arrays
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    flatten : {False, True}, optional
        Whether to collapse nested fields.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])),
    ...                  usemask=False).tolist()
    [(1, 10.0), (2, 20.0), (-1, 30.0)]

    Notes
    -----
    * Without a mask, the missing value will be filled with something,
      depending on what its corresponding type:
            -1      for integers
            -1.0    for floating point numbers
            '-'     for characters
            '-1'    for strings
            True    for boolean values
    * XXX: I just obtained these values empirically

    """
    # A sequence holding a single item is replaced by that item.
    if len(seqarrays) == 1:
        seqarrays = np.asanyarray(seqarrays[0])
    if isinstance(seqarrays, (ndarray, np.void)):
        # Single array: a view is enough unless flattening would change the
        # description of its datatype.
        single_dtype = seqarrays.dtype
        if (not flatten) or \
                (zip_descr((seqarrays,), flatten=True) == single_dtype.descr):
            flat = seqarrays.ravel()
            # Unstructured input is wrapped in one automatically named field.
            if not single_dtype.names:
                single_dtype = [('', single_dtype)]
            if usemask:
                outtype = MaskedRecords if asrecarray else MaskedArray
            else:
                outtype = recarray if asrecarray else ndarray
            return flat.view(dtype=single_dtype, type=outtype)
        seqarrays = (seqarrays,)
    else:
        seqarrays = [np.asanyarray(item) for item in seqarrays]
    # Every input is padded up to the size of the largest one.
    lengths = tuple(arr.size for arr in seqarrays)
    longest = max(lengths)
    outdtype = zip_descr(seqarrays, flatten=flatten)
    columns = []
    if not usemask:
        for (arr, size) in zip(seqarrays, lengths):
            npad = longest - size
            values = arr.ravel().__array__()
            if npad:
                pad = _check_fill_value(fill_value, arr.dtype)
                if isinstance(pad, (ndarray, np.void)):
                    if len(pad.dtype) == 1:
                        # Single-field filler: use the bare value.
                        pad = pad.item()[0]
                    else:
                        pad = np.array(pad, dtype=arr.dtype, ndmin=1)
            else:
                pad = None
            columns.append(itertools.chain(values, [pad] * npad))
        output = np.fromiter(tuple(izip_records(columns, flatten=flatten)),
                             dtype=outdtype, count=longest)
        if asrecarray:
            output = output.view(recarray)
        return output
    # Masked output: the padding is tracked in a parallel set of mask columns.
    maskcolumns = []
    for (arr, size) in zip(seqarrays, lengths):
        npad = longest - size
        values = arr.ravel().__array__()
        flags = ma.getmaskarray(arr).ravel()
        if npad:
            pad = _check_fill_value(fill_value, arr.dtype)
            if isinstance(pad, (ndarray, np.void)):
                if len(pad.dtype) == 1:
                    pad = pad.item()[0]
                    padflag = True
                else:
                    pad = np.array(pad, dtype=arr.dtype, ndmin=1)
                    padflag = np.ones((1,), dtype=flags.dtype)
        else:
            pad = None
            padflag = True
        columns.append(itertools.chain(values, [pad] * npad))
        maskcolumns.append(itertools.chain(flags, [padflag] * npad))
    records = tuple(izip_records(columns, flatten=flatten))
    output = ma.array(np.fromiter(records, dtype=outdtype, count=longest),
                      mask=list(izip_records(maskcolumns, flatten=flatten)))
    if asrecarray:
        output = output.view(MaskedRecords)
    return output
def drop_fields(base, drop_names, usemask=True, asrecarray=False):
    """
    Return a new array with fields in `drop_names` dropped.

    Nested fields are supported.

    Parameters
    ----------
    base : array
        Input array
    drop_names : string or sequence
        String or sequence of strings corresponding to the names of the
        fields to drop.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray or a mrecarray (`asrecarray=True`) or
        a plain ndarray or masked array with flexible dtype. The default
        is False.

    Returns
    -------
    output : array or None
        The reduced array, or None when no field is left.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
    ...   dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
    >>> rfn.drop_fields(a, 'a').dtype.names
    ('b',)
    >>> rfn.drop_fields(a, ['ba', 'bb']).dtype.names
    ('a',)

    """
    if _is_string_like(drop_names):
        drop_names = [drop_names, ]
    else:
        drop_names = set(drop_names)

    def _drop_descr(ndtype, drop_names):
        # Description of `ndtype` without the dropped names, at any depth.
        kept = []
        for name in ndtype.names:
            if name in drop_names:
                continue
            fieldtype = ndtype[name]
            if not fieldtype.names:
                kept.append((name, fieldtype))
                continue
            pruned = _drop_descr(fieldtype, drop_names)
            # A nested field left without sub-fields disappears as well.
            if pruned:
                kept.append((name, pruned))
        return kept

    newdtype = _drop_descr(base.dtype, drop_names)
    if not newdtype:
        return None
    output = recursive_fill_fields(base, np.empty(base.shape, dtype=newdtype))
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
def rec_drop_fields(base, drop_names):
    """
    Returns a new numpy.recarray with fields in `drop_names` dropped.

    Shortcut for `drop_fields` called with ``usemask=False`` and
    ``asrecarray=True``.

    """
    options = dict(usemask=False, asrecarray=True)
    return drop_fields(base, drop_names, **options)
def rename_fields(base, namemapper):
    """
    Rename the fields from a flexible-datatype ndarray or recarray.

    Nested fields are supported.

    Parameters
    ----------
    base : ndarray
        Input array whose fields must be modified.
    namemapper : dictionary
        Dictionary mapping old field names to their new version.

    Returns
    -------
    renamed : ndarray
        A view of `base` with the renamed datatype.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
    ...   dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
    >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'}).dtype.names
    ('A', 'b')

    """
    def _renamed_descr(ndtype):
        # Description of `ndtype` with the mapping applied at every depth.
        descr = []
        for oldname in ndtype.names:
            newname = namemapper.get(oldname, oldname)
            fieldtype = ndtype[oldname]
            if fieldtype.names:
                fieldtype = _renamed_descr(fieldtype)
            descr.append((newname, fieldtype))
        return descr

    return base.view(_renamed_descr(base.dtype))
def append_fields(base, names, data, dtypes=None,
                  fill_value=-1, usemask=True, asrecarray=False):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Raises
    ------
    ValueError
        If `names` is a list or tuple whose length differs from that of
        `data`, or if `dtypes` holds more than one item but not one per
        data array.

    """
    # Check the names
    if isinstance(names, (tuple, list)):
        if len(names) != len(data):
            msg = "The number of arrays does not match the number of names"
            raise ValueError(msg)
    elif isinstance(names, basestring):
        names = [names, ]
        data = [data, ]
    #
    # `np.asanyarray` is used instead of `np.array(..., copy=False,
    # subok=True)`: both avoid needless copies, but under NumPy 2
    # `copy=False` raises whenever a copy cannot be avoided (e.g. when the
    # data is given as a list).
    if dtypes is None:
        data = [np.asanyarray(a) for a in data]
        data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
    else:
        if not isinstance(dtypes, (tuple, list)):
            dtypes = [dtypes, ]
        if len(data) != len(dtypes):
            if len(dtypes) == 1:
                # A single datatype applies to every new field.
                dtypes = dtypes * len(data)
            else:
                msg = "The dtypes argument must be None, a dtype, or a list."
                raise ValueError(msg)
        data = [np.asanyarray(a, dtype=d).view([(n, d)])
                for (a, n, d) in zip(data, names, dtypes)]
    #
    base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
    if len(data) > 1:
        data = merge_arrays(data, flatten=True, usemask=usemask,
                            fill_value=fill_value)
    else:
        data = data.pop()
    #
    # Start from a fully masked array: entries that neither `base` nor
    # `data` fills stay masked.
    output = ma.masked_all(max(len(base), len(data)),
                           dtype=base.dtype.descr + data.dtype.descr)
    output = recursive_fill_fields(base, output)
    output = recursive_fill_fields(data, output)
    #
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
def rec_append_fields(base, names, data, dtypes=None):
    """
    Add new fields to an existing array.

    Shortcut for `append_fields` called with ``asrecarray=True`` and
    ``usemask=False``.
    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.

    See Also
    --------
    append_fields

    Returns
    -------
    appended_array : np.recarray

    """
    options = dict(data=data, dtypes=dtypes, asrecarray=True, usemask=False)
    return append_fields(base, names, **options)
def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                 autoconvert=False):
    """
    Superposes arrays fields by fields

    Parameters
    ----------
    arrays : array or sequence
        Sequence of input arrays.  A single ndarray, or the only item of a
        one-item sequence, is returned as is.
    defaults : dictionary, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.
    autoconvert : {False, True}, optional
        Whether automatically cast the type of the field to the maximum.

    Raises
    ------
    TypeError
        If a field shared by several arrays has incompatible descriptions
        and `autoconvert` is False.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> x = np.array([1, 2,])
    >>> rfn.stack_arrays(x) is x
    True
    >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
    >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
    ...   dtype=[('A', '|S3'), ('B', float), ('C', float)])
    >>> test = rfn.stack_arrays((z,zz))
    >>> test.dtype.names
    ('A', 'B', 'C')
    >>> len(test)
    5

    """
    if isinstance(arrays, ndarray):
        return arrays
    elif len(arrays) == 1:
        return arrays[0]
    seqarrays = [np.asanyarray(a).ravel() for a in arrays]
    nrecords = [len(a) for a in seqarrays]
    ndtype = [a.dtype for a in seqarrays]
    fldnames = [d.names for d in ndtype]

    def _spec(descr):
        # What is reported about a field in the error message: its type,
        # with the shape appended for sub-array fields.
        return descr[1] if len(descr) == 2 else descr[1:]

    # Build the output description: the fields of the first array, followed
    # by the new fields of the other arrays in order of appearance.
    dtype_l = ndtype[0]
    newdescr = dtype_l.descr
    names = [_[0] for _ in newdescr]
    for dtype_n in ndtype[1:]:
        for descr in dtype_n.descr:
            name = descr[0] or ''
            if name not in names:
                newdescr.append(descr)
                names.append(name)
            else:
                nameidx = names.index(name)
                current_descr = newdescr[nameidx]
                # A descr entry is (name, type) or (name, type, shape): the
                # type always sits at index 1, never at index -1.
                if autoconvert:
                    if np.dtype(descr[1]) > np.dtype(current_descr[1]):
                        current_descr = list(current_descr)
                        current_descr[1] = descr[1]
                        newdescr[nameidx] = tuple(current_descr)
                elif tuple(descr[1:]) != tuple(current_descr[1:]):
                    raise TypeError("Incompatible type '%s' <> '%s'" %
                                    (_spec(current_descr), _spec(descr)))
    # Only one field: use concatenate
    if len(newdescr) == 1:
        output = ma.concatenate(seqarrays)
    else:
        # Start fully masked and unmask what each input provides.
        output = ma.masked_all((np.sum(nrecords),), newdescr)
        offset = np.cumsum(np.r_[0, nrecords])
        seen = []
        for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]):
            names = a.dtype.names
            if names is None:
                output['f%i' % len(seen)][i:j] = a
            else:
                for name in n:
                    output[name][i:j] = a[name]
                    if name not in seen:
                        seen.append(name)
    #
    return _fix_output(_fix_defaults(output, defaults),
                       usemask=usemask, asrecarray=asrecarray)
def find_duplicates(a, key=None, ignoremask=True, return_index=False):
    """
    Find the duplicates in a structured array along a given key

    Parameters
    ----------
    a : array-like
        Input array.  It is converted to a masked array, so plain ndarrays
        are accepted as well.
    key : {string, None}, optional
        Name of the fields along which to check the duplicates.
        If None, the search is performed by records
    ignoremask : {True, False}, optional
        Whether masked data should be discarded or considered as duplicates.
    return_index : {False, True}, optional
        Whether to return the indices of the duplicated values.

    Returns
    -------
    duplicates : MaskedArray
        The duplicated entries, in sorted order.
    index : ndarray
        Positions of the duplicates in the ravelled input; only returned
        when `return_index` is True.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = [('a', int)]
    >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
    ...         mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
    >>> dups, idx = rfn.find_duplicates(a, ignoremask=True, return_index=True)

    """
    # The code below relies on MaskedArray methods (`filled`, `recordmask`).
    a = ma.asanyarray(a).ravel()
    # Get the sorting data (by selecting the corresponding field)
    base = a
    if key:
        # The field structure is only needed to reach a (nested) key.
        fields = get_fieldstructure(a.dtype)
        for f in fields[key]:
            base = base[f]
        base = base[key]
    # Get the sorting indices and the sorted data
    sortidx = base.argsort()
    sortedbase = base[sortidx]
    sorteddata = sortedbase.filled()
    # Compare each sorted entry with its successor
    flag = (sorteddata[:-1] == sorteddata[1:])
    # If masked data must be ignored, set the flag to false where needed.
    # Without a mask (`nomask`) there is nothing to ignore, and
    # `recordmask` could not be sliced anyway.
    if ignoremask and ma.getmask(sortedbase) is not ma.nomask:
        sortedmask = sortedbase.recordmask
        flag[sortedmask[1:]] = False
    flag = np.concatenate(([False], flag))
    # We need to take the point on the left as well (else we're missing it)
    flag[:-1] = flag[:-1] + flag[1:]
    duplicates = a[sortidx][flag]
    if return_index:
        return (duplicates, sortidx[flag])
    else:
        return duplicates
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, usemask=True, asrecarray=False):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of string corresponding
    to the fields used to join the array.  An exception is raised if the
    `key` field cannot be found in the two input arrays.  Neither `r1` nor
    `r2` should have any duplicates along `key`: the presence of duplicates
    will make the output quite unreliable. Note that duplicates are not
    looked for by the algorithm.

    Parameters
    ----------
    key : {string, sequence}
        A string or a sequence of strings corresponding to the fields used
        for comparison.
    r1, r2 : arrays
        Structured arrays.
    jointype : {'inner', 'outer', 'leftouter'}, optional
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of r2 not in r1.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    r1postfix : string, optional
        String appended to the names of the fields of r1 that are present
        in r2 but absent of the key.
    r2postfix : string, optional
        String appended to the names of the fields of r2 that are present
        in r1 but absent of the key.
    defaults : {dictionary}, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.

    Raises
    ------
    ValueError
        If `jointype` is not one of the supported values, if a key field
        is missing from `r1` or `r2`, or if both postfixes are empty while
        the arrays share non-key field names.

    Notes
    -----
    * The output is sorted along the key.
    * A temporary array is formed by dropping the fields not in the key for
      the two arrays and concatenating the result. This array is then
      sorted, and the common entries selected. The output is constructed by
      filling the fields with the selected entries. Matching is not
      preserved if there are some duplicates...

    """
    # Check jointype
    if jointype not in ('inner', 'outer', 'leftouter'):
        raise ValueError(
            "The 'jointype' argument should be in 'inner', "
            "'outer' or 'leftouter' (got '%s' instead)" % jointype
        )
    # If we have a single key, put it in a tuple
    if isinstance(key, basestring):
        key = (key,)

    # Check the keys
    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %s' % name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %s' % name)

    # Make sure we work with ravelled arrays
    r1 = r1.ravel()
    r2 = r2.ravel()
    nb1 = len(r1)
    (r1names, r2names) = (r1.dtype.names, r2.dtype.names)

    # Check the names for collision
    if (set.intersection(set(r1names), set(r2names)).difference(key) and
            not (r1postfix or r2postfix)):
        msg = "r1 and r2 contain common names, r1postfix and r2postfix "
        msg += "can't be empty"
        raise ValueError(msg)

    # Make temporary arrays of just the keys
    r1k = drop_fields(r1, [n for n in r1names if n not in key])
    r2k = drop_fields(r2, [n for n in r2names if n not in key])

    # Concatenate the two arrays for comparison
    aux = ma.concatenate((r1k, r2k))
    idx_sort = aux.argsort(order=key)
    aux = aux[idx_sort]
    #
    # Get the common keys: after sorting, equal keys are adjacent
    flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
    flag_in[:-1] = flag_in[1:] + flag_in[:-1]
    idx_in = idx_sort[flag_in]
    # Indices below nb1 point into r1, the others into r2
    idx_1 = idx_in[(idx_in < nb1)]
    idx_2 = idx_in[(idx_in >= nb1)] - nb1
    (r1cmn, r2cmn) = (len(idx_1), len(idx_2))
    if jointype == 'inner':
        (r1spc, r2spc) = (0, 0)
    elif jointype == 'outer':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn)
    elif jointype == 'leftouter':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, 0)
    # Select the entries from each input
    (s1, s2) = (r1[idx_1], r2[idx_2])
    #
    # Build the new description of the output array .......
    # Start with the key fields
    ndtype = [list(_) for _ in r1k.dtype.descr]
    # Add the other fields
    ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key)
    for desc in r2.dtype.descr:
        desc = list(desc)
        name = desc[0]
        # The names are rebuilt on every pass: suffixing and insertion
        # change both the names and their positions.
        names = [_[0] for _ in ndtype]
        # Look the field up by name only, so that a same-named field with a
        # different type is still recognized as a collision.
        try:
            nameidx = names.index(name)
        except ValueError:
            #... not seen yet: just add the description to the current list
            ndtype.append(desc)
            continue
        current = ndtype[nameidx]
        if name in key:
            # Part of the key: take the largest dtype, comparing datatypes
            # rather than their string codes.
            current[1] = max(np.dtype(desc[1]), np.dtype(current[1]))
        else:
            # Not part of the key: add the suffixes and keep both fields
            # next to each other.
            current[0] += r1postfix
            desc[0] += r2postfix
            ndtype.insert(nameidx + 1, desc)
    # Revert the elements to tuples
    ndtype = [tuple(_) for _ in ndtype]
    # Find the largest nb of common fields :
    # r1cmn and r2cmn should be equal, but...
    cmn = max(r1cmn, r2cmn)
    # Construct an empty array
    output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
    names = output.dtype.names
    for f in r1names:
        selected = s1[f]
        if f not in names or (f in r2names and not r2postfix and f not in key):
            f += r1postfix
        current = output[f]
        current[:r1cmn] = selected[:r1cmn]
        if jointype in ('outer', 'leftouter'):
            current[cmn:cmn + r1spc] = selected[r1cmn:]
    for f in r2names:
        selected = s2[f]
        if f not in names or (f in r1names and not r1postfix and f not in key):
            f += r2postfix
        current = output[f]
        current[:r2cmn] = selected[:r2cmn]
        if (jointype == 'outer') and r2spc:
            current[-r2spc:] = selected[r2cmn:]
    # Sort and finalize the output
    output.sort(order=key)
    kwargs = dict(usemask=usemask, asrecarray=asrecarray)
    return _fix_output(_fix_defaults(output, defaults), **kwargs)
def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
             defaults=None):
    """
    Join arrays `r1` and `r2` on keys.

    Alternative to join_by, that always returns a np.recarray: `join_by` is
    called with ``usemask=False`` and ``asrecarray=True``.

    See Also
    --------
    join_by : equivalent function

    """
    return join_by(key, r1, r2,
                   jointype=jointype,
                   r1postfix=r1postfix,
                   r2postfix=r2postfix,
                   defaults=defaults,
                   usemask=False,
                   asrecarray=True)