JayKimDevolved's picture
JayKimDevolved/deepseek
c011401 verified
/* -*- c -*- */
/*
*****************************************************************************
** INCLUDES **
*****************************************************************************
*/
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#include "Python.h"
#include "numpy/arrayobject.h"
#include "numpy/ufuncobject.h"
#include "npy_pycompat.h"
#include "npy_config.h"
#include <stddef.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
static const char* umath_linalg_version_string = "0.1.4";
/*
****************************************************************************
* Debugging support *
****************************************************************************
*/
#define TRACE_TXT(...) do { fprintf (stderr, __VA_ARGS__); } while (0)
#define STACK_TRACE do {} while (0)
#define TRACE\
do { \
fprintf (stderr, \
"%s:%d:%s\n", \
__FILE__, \
__LINE__, \
__FUNCTION__); \
STACK_TRACE; \
} while (0)
#if 0
#include <execinfo.h>
void
dbg_stack_trace()
{
void *trace[32];
size_t size;
size = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
backtrace_symbols_fd(trace, size, 1);
}
#undef STACK_TRACE
#define STACK_TRACE do { dbg_stack_trace(); } while (0)
#endif
/*
*****************************************************************************
* BLAS/LAPACK calling macros *
*****************************************************************************
*/
#ifdef NO_APPEND_FORTRAN
# define FNAME(x) x
#else
# define FNAME(x) x##_
#endif
typedef struct { float r, i; } f2c_complex;
typedef struct { double r, i; } f2c_doublecomplex;
/* typedef long int (*L_fp)(); */
extern int
FNAME(sgeev)(char *jobvl, char *jobvr, int *n,
float a[], int *lda, float wr[], float wi[],
float vl[], int *ldvl, float vr[], int *ldvr,
float work[], int lwork[],
int *info);
extern int
FNAME(dgeev)(char *jobvl, char *jobvr, int *n,
double a[], int *lda, double wr[], double wi[],
double vl[], int *ldvl, double vr[], int *ldvr,
double work[], int lwork[],
int *info);
extern int
FNAME(cgeev)(char *jobvl, char *jobvr, int *n,
f2c_doublecomplex a[], int *lda,
f2c_doublecomplex w[],
f2c_doublecomplex vl[], int *ldvl,
f2c_doublecomplex vr[], int *ldvr,
f2c_doublecomplex work[], int *lwork,
double rwork[],
int *info);
extern int
FNAME(zgeev)(char *jobvl, char *jobvr, int *n,
f2c_doublecomplex a[], int *lda,
f2c_doublecomplex w[],
f2c_doublecomplex vl[], int *ldvl,
f2c_doublecomplex vr[], int *ldvr,
f2c_doublecomplex work[], int *lwork,
double rwork[],
int *info);
extern int
FNAME(ssyevd)(char *jobz, char *uplo, int *n,
float a[], int *lda, float w[], float work[],
int *lwork, int iwork[], int *liwork,
int *info);
extern int
FNAME(dsyevd)(char *jobz, char *uplo, int *n,
double a[], int *lda, double w[], double work[],
int *lwork, int iwork[], int *liwork,
int *info);
extern int
FNAME(cheevd)(char *jobz, char *uplo, int *n,
f2c_complex a[], int *lda,
float w[], f2c_complex work[],
int *lwork, float rwork[], int *lrwork, int iwork[],
int *liwork,
int *info);
extern int
FNAME(zheevd)(char *jobz, char *uplo, int *n,
f2c_doublecomplex a[], int *lda,
double w[], f2c_doublecomplex work[],
int *lwork, double rwork[], int *lrwork, int iwork[],
int *liwork,
int *info);
extern int
FNAME(dgelsd)(int *m, int *n, int *nrhs,
double a[], int *lda, double b[], int *ldb,
double s[], double *rcond, int *rank,
double work[], int *lwork, int iwork[],
int *info);
extern int
FNAME(zgelsd)(int *m, int *n, int *nrhs,
f2c_doublecomplex a[], int *lda,
f2c_doublecomplex b[], int *ldb,
double s[], double *rcond, int *rank,
f2c_doublecomplex work[], int *lwork,
double rwork[], int iwork[],
int *info);
extern int
FNAME(sgesv)(int *n, int *nrhs,
float a[], int *lda,
int ipiv[],
float b[], int *ldb,
int *info);
extern int
FNAME(dgesv)(int *n, int *nrhs,
double a[], int *lda,
int ipiv[],
double b[], int *ldb,
int *info);
extern int
FNAME(cgesv)(int *n, int *nrhs,
f2c_complex a[], int *lda,
int ipiv[],
f2c_complex b[], int *ldb,
int *info);
extern int
FNAME(zgesv)(int *n, int *nrhs,
f2c_doublecomplex a[], int *lda,
int ipiv[],
f2c_doublecomplex b[], int *ldb,
int *info);
extern int
FNAME(sgetrf)(int *m, int *n,
float a[], int *lda,
int ipiv[],
int *info);
extern int
FNAME(dgetrf)(int *m, int *n,
double a[], int *lda,
int ipiv[],
int *info);
extern int
FNAME(cgetrf)(int *m, int *n,
f2c_complex a[], int *lda,
int ipiv[],
int *info);
extern int
FNAME(zgetrf)(int *m, int *n,
f2c_doublecomplex a[], int *lda,
int ipiv[],
int *info);
extern int
FNAME(spotrf)(char *uplo, int *n,
float a[], int *lda,
int *info);
extern int
FNAME(dpotrf)(char *uplo, int *n,
double a[], int *lda,
int *info);
extern int
FNAME(cpotrf)(char *uplo, int *n,
f2c_complex a[], int *lda,
int *info);
extern int
FNAME(zpotrf)(char *uplo, int *n,
f2c_doublecomplex a[], int *lda,
int *info);
extern int
FNAME(sgesdd)(char *jobz, int *m, int *n,
float a[], int *lda, float s[], float u[],
int *ldu, float vt[], int *ldvt, float work[],
int *lwork, int iwork[], int *info);
extern int
FNAME(dgesdd)(char *jobz, int *m, int *n,
double a[], int *lda, double s[], double u[],
int *ldu, double vt[], int *ldvt, double work[],
int *lwork, int iwork[], int *info);
extern int
FNAME(cgesdd)(char *jobz, int *m, int *n,
f2c_complex a[], int *lda,
float s[], f2c_complex u[], int *ldu,
f2c_complex vt[], int *ldvt,
f2c_complex work[], int *lwork,
float rwork[], int iwork[], int *info);
extern int
FNAME(zgesdd)(char *jobz, int *m, int *n,
f2c_doublecomplex a[], int *lda,
double s[], f2c_doublecomplex u[], int *ldu,
f2c_doublecomplex vt[], int *ldvt,
f2c_doublecomplex work[], int *lwork,
double rwork[], int iwork[], int *info);
extern int
FNAME(spotrs)(char *uplo, int *n, int *nrhs,
float a[], int *lda,
float b[], int *ldb,
int *info);
extern int
FNAME(dpotrs)(char *uplo, int *n, int *nrhs,
double a[], int *lda,
double b[], int *ldb,
int *info);
extern int
FNAME(cpotrs)(char *uplo, int *n, int *nrhs,
f2c_complex a[], int *lda,
f2c_complex b[], int *ldb,
int *info);
extern int
FNAME(zpotrs)(char *uplo, int *n, int *nrhs,
f2c_doublecomplex a[], int *lda,
f2c_doublecomplex b[], int *ldb,
int *info);
extern int
FNAME(spotri)(char *uplo, int *n,
float a[], int *lda,
int *info);
extern int
FNAME(dpotri)(char *uplo, int *n,
double a[], int *lda,
int *info);
extern int
FNAME(cpotri)(char *uplo, int *n,
f2c_complex a[], int *lda,
int *info);
extern int
FNAME(zpotri)(char *uplo, int *n,
f2c_doublecomplex a[], int *lda,
int *info);
extern int
FNAME(scopy)(int *n,
float *sx, int *incx,
float *sy, int *incy);
extern int
FNAME(dcopy)(int *n,
double *sx, int *incx,
double *sy, int *incy);
extern int
FNAME(ccopy)(int *n,
f2c_complex *sx, int *incx,
f2c_complex *sy, int *incy);
extern int
FNAME(zcopy)(int *n,
f2c_doublecomplex *sx, int *incx,
f2c_doublecomplex *sy, int *incy);
extern float
FNAME(sdot)(int *n,
float *sx, int *incx,
float *sy, int *incy);
extern double
FNAME(ddot)(int *n,
double *sx, int *incx,
double *sy, int *incy);
extern f2c_complex
FNAME(cdotu)(int *n,
f2c_complex *sx, int *incx,
f2c_complex *sy, int *incy);
extern f2c_doublecomplex
FNAME(zdotu)(int *n,
f2c_doublecomplex *sx, int *incx,
f2c_doublecomplex *sy, int *incy);
extern f2c_complex
FNAME(cdotc)(int *n,
f2c_complex *sx, int *incx,
f2c_complex *sy, int *incy);
extern f2c_doublecomplex
FNAME(zdotc)(int *n,
f2c_doublecomplex *sx, int *incx,
f2c_doublecomplex *sy, int *incy);
extern int
FNAME(sgemm)(char *transa, char *transb,
int *m, int *n, int *k,
float *alpha,
float *a, int *lda,
float *b, int *ldb,
float *beta,
float *c, int *ldc);
extern int
FNAME(dgemm)(char *transa, char *transb,
int *m, int *n, int *k,
double *alpha,
double *a, int *lda,
double *b, int *ldb,
double *beta,
double *c, int *ldc);
extern int
FNAME(cgemm)(char *transa, char *transb,
int *m, int *n, int *k,
f2c_complex *alpha,
f2c_complex *a, int *lda,
f2c_complex *b, int *ldb,
f2c_complex *beta,
f2c_complex *c, int *ldc);
extern int
FNAME(zgemm)(char *transa, char *transb,
int *m, int *n, int *k,
f2c_doublecomplex *alpha,
f2c_doublecomplex *a, int *lda,
f2c_doublecomplex *b, int *ldb,
f2c_doublecomplex *beta,
f2c_doublecomplex *c, int *ldc);
#define LAPACK_T(FUNC) \
TRACE_TXT("Calling LAPACK ( " # FUNC " )\n"); \
FNAME(FUNC)
#define BLAS(FUNC) \
FNAME(FUNC)
#define LAPACK(FUNC) \
FNAME(FUNC)
typedef int fortran_int;
typedef float fortran_real;
typedef double fortran_doublereal;
typedef f2c_complex fortran_complex;
typedef f2c_doublecomplex fortran_doublecomplex;
/*
*****************************************************************************
** Some handy functions **
*****************************************************************************
*/
static inline void *
offset_ptr(void* ptr, ptrdiff_t offset)
{
return (void*)((npy_uint8*)ptr + offset);
}
static inline int
get_fp_invalid_and_clear(void)
{
int status;
status = npy_clear_floatstatus();
return !!(status & NPY_FPE_INVALID);
}
static inline void
set_fp_invalid_or_clear(int error_occurred)
{
if (error_occurred) {
npy_set_floatstatus_invalid();
}
else {
npy_clear_floatstatus();
}
}
/*
*****************************************************************************
** Some handy constants **
*****************************************************************************
*/
#define UMATH_LINALG_MODULE_NAME "_umath_linalg"
typedef union {
fortran_complex f;
npy_cfloat npy;
float array[2];
} COMPLEX_t;
typedef union {
fortran_doublecomplex f;
npy_cdouble npy;
double array[2];
} DOUBLECOMPLEX_t;
static float s_one;
static float s_zero;
static float s_minus_one;
static float s_ninf;
static float s_nan;
static double d_one;
static double d_zero;
static double d_minus_one;
static double d_ninf;
static double d_nan;
static COMPLEX_t c_one;
static COMPLEX_t c_zero;
static COMPLEX_t c_minus_one;
static COMPLEX_t c_ninf;
static COMPLEX_t c_nan;
static DOUBLECOMPLEX_t z_one;
static DOUBLECOMPLEX_t z_zero;
static DOUBLECOMPLEX_t z_minus_one;
static DOUBLECOMPLEX_t z_ninf;
static DOUBLECOMPLEX_t z_nan;
static void init_constants(void)
{
/*
this is needed as NPY_INFINITY and NPY_NAN macros
can't be used as initializers. I prefer to just set
all the constants the same way.
*/
s_one = 1.0f;
s_zero = 0.0f;
s_minus_one = -1.0f;
s_ninf = -NPY_INFINITYF;
s_nan = NPY_NANF;
d_one = 1.0;
d_zero = 0.0;
d_minus_one = -1.0;
d_ninf = -NPY_INFINITY;
d_nan = NPY_NAN;
c_one.array[0] = 1.0f;
c_one.array[1] = 0.0f;
c_zero.array[0] = 0.0f;
c_zero.array[1] = 0.0f;
c_minus_one.array[0] = -1.0f;
c_minus_one.array[1] = 0.0f;
c_ninf.array[0] = -NPY_INFINITYF;
c_ninf.array[1] = 0.0f;
c_nan.array[0] = NPY_NANF;
c_nan.array[1] = NPY_NANF;
z_one.array[0] = 1.0;
z_one.array[1] = 0.0;
z_zero.array[0] = 0.0;
z_zero.array[1] = 0.0;
z_minus_one.array[0] = -1.0;
z_minus_one.array[1] = 0.0;
z_ninf.array[0] = -NPY_INFINITY;
z_ninf.array[1] = 0.0;
z_nan.array[0] = NPY_NAN;
z_nan.array[1] = NPY_NAN;
}
/*
*****************************************************************************
** Structs used for data rearrangement **
*****************************************************************************
*/
/* this struct contains information about how to linearize in a local buffer
a matrix so that it can be used by blas functions.
All strides are specified in number of elements (similar to what blas
expects)
dst_row_strides: number of elements between different row. Matrix is
considered row-major
dst_column_strides: number of elements between differnt columns in the
destination buffer
rows: number of rows of the matrix
columns: number of columns of the matrix
src_row_strides: strides needed to access the next row in the source matrix
src_column_strides: strides needed to access the next column in the source
matrix
*/
typedef struct linearize_data_struct
{
size_t rows;
size_t columns;
ptrdiff_t row_strides;
ptrdiff_t column_strides;
} LINEARIZE_DATA_t;
static inline void
init_linearize_data(LINEARIZE_DATA_t *lin_data,
int rows,
int columns,
ptrdiff_t row_strides,
ptrdiff_t column_strides)
{
lin_data->rows = rows;
lin_data->columns = columns;
lin_data->row_strides = row_strides;
lin_data->column_strides = column_strides;
}
static inline void
dump_ufunc_object(PyUFuncObject* ufunc)
{
TRACE_TXT("\n\n%s '%s' (%d input(s), %d output(s), %d specialization(s).\n",
ufunc->core_enabled? "generalized ufunc" : "scalar ufunc",
ufunc->name, ufunc->nin, ufunc->nout, ufunc->ntypes);
if (ufunc->core_enabled) {
int arg;
int dim;
TRACE_TXT("\t%s (%d dimension(s) detected).\n",
ufunc->core_signature, ufunc->core_num_dim_ix);
for (arg = 0; arg < ufunc->nargs; arg++){
int * arg_dim_ix = ufunc->core_dim_ixs + ufunc->core_offsets[arg];
TRACE_TXT("\t\targ %d (%s) has %d dimension(s): (",
arg, arg < ufunc->nin? "INPUT" : "OUTPUT",
ufunc->core_num_dims[arg]);
for (dim = 0; dim < ufunc->core_num_dims[arg]; dim ++) {
TRACE_TXT(" %d", arg_dim_ix[dim]);
}
TRACE_TXT(" )\n");
}
}
}
static inline void
dump_linearize_data(const char* name, const LINEARIZE_DATA_t* params)
{
TRACE_TXT("\n\t%s rows: %zd columns: %zd"\
"\n\t\trow_strides: %td column_strides: %td"\
"\n", name, params->rows, params->columns,
params->row_strides, params->column_strides);
}
static inline float
FLOAT_add(float op1, float op2)
{
return op1 + op2;
}
static inline double
DOUBLE_add(double op1, double op2)
{
return op1 + op2;
}
static inline COMPLEX_t
CFLOAT_add(COMPLEX_t op1, COMPLEX_t op2)
{
COMPLEX_t result;
result.array[0] = op1.array[0] + op2.array[0];
result.array[1] = op1.array[1] + op2.array[1];
return result;
}
static inline DOUBLECOMPLEX_t
CDOUBLE_add(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2)
{
DOUBLECOMPLEX_t result;
result.array[0] = op1.array[0] + op2.array[0];
result.array[1] = op1.array[1] + op2.array[1];
return result;
}
static inline float
FLOAT_mul(float op1, float op2)
{
return op1*op2;
}
static inline double
DOUBLE_mul(double op1, double op2)
{
return op1*op2;
}
static inline COMPLEX_t
CFLOAT_mul(COMPLEX_t op1, COMPLEX_t op2)
{
COMPLEX_t result;
result.array[0] = op1.array[0]*op2.array[0] - op1.array[1]*op2.array[1];
result.array[1] = op1.array[1]*op2.array[0] + op1.array[0]*op2.array[1];
return result;
}
static inline DOUBLECOMPLEX_t
CDOUBLE_mul(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2)
{
DOUBLECOMPLEX_t result;
result.array[0] = op1.array[0]*op2.array[0] - op1.array[1]*op2.array[1];
result.array[1] = op1.array[1]*op2.array[0] + op1.array[0]*op2.array[1];
return result;
}
static inline float
FLOAT_mulc(float op1, float op2)
{
return op1*op2;
}
static inline double
DOUBLE_mulc(float op1, float op2)
{
return op1*op2;
}
static inline COMPLEX_t
CFLOAT_mulc(COMPLEX_t op1, COMPLEX_t op2)
{
COMPLEX_t result;
result.array[0] = op1.array[0]*op2.array[0] + op1.array[1]*op2.array[1];
result.array[1] = op1.array[0]*op2.array[1] - op1.array[1]*op2.array[0];
return result;
}
static inline DOUBLECOMPLEX_t
CDOUBLE_mulc(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2)
{
DOUBLECOMPLEX_t result;
result.array[0] = op1.array[0]*op2.array[0] + op1.array[1]*op2.array[1];
result.array[1] = op1.array[0]*op2.array[1] - op1.array[1]*op2.array[0];
return result;
}
static inline void
print_FLOAT(npy_float s)
{
TRACE_TXT(" %8.4f", s);
}
static inline void
print_DOUBLE(npy_double d)
{
TRACE_TXT(" %10.6f", d);
}
static inline void
print_CFLOAT(npy_cfloat c)
{
float* c_parts = (float*)&c;
TRACE_TXT("(%8.4f, %8.4fj)", c_parts[0], c_parts[1]);
}
static inline void
print_CDOUBLE(npy_cdouble z)
{
double* z_parts = (double*)&z;
TRACE_TXT("(%8.4f, %8.4fj)", z_parts[0], z_parts[1]);
}
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#typ=npy_float,npy_double,npy_cfloat,npy_cdouble#
*/
static inline void
dump_@TYPE@_matrix(const char* name,
size_t rows, size_t columns,
const @typ@* ptr)
{
size_t i,j;
TRACE_TXT("\n%s %p (%zd, %zd)\n", name, ptr, rows, columns);
for (i=0; i<rows; i++)
{
TRACE_TXT("| ");
for (j=0; j<columns; j++)
{
print_@TYPE@(ptr[j*rows + i]);
TRACE_TXT(", ");
}
TRACE_TXT(" |\n");
}
}
/**end repeat**/
/*
*****************************************************************************
** Basics **
*****************************************************************************
*/
#define INIT_OUTER_LOOP_1 \
npy_intp dN = *dimensions++;\
npy_intp N_;\
npy_intp s0 = *steps++;
#define INIT_OUTER_LOOP_2 \
INIT_OUTER_LOOP_1\
npy_intp s1 = *steps++;
#define INIT_OUTER_LOOP_3 \
INIT_OUTER_LOOP_2\
npy_intp s2 = *steps++;
#define INIT_OUTER_LOOP_4 \
INIT_OUTER_LOOP_3\
npy_intp s3 = *steps++;
#define INIT_OUTER_LOOP_5 \
INIT_OUTER_LOOP_4\
npy_intp s4 = *steps++;
#define INIT_OUTER_LOOP_6 \
INIT_OUTER_LOOP_5\
npy_intp s5 = *steps++;
#define BEGIN_OUTER_LOOP_2 \
for (N_ = 0;\
N_ < dN;\
N_++, args[0] += s0,\
args[1] += s1) {
#define BEGIN_OUTER_LOOP_3 \
for (N_ = 0;\
N_ < dN;\
N_++, args[0] += s0,\
args[1] += s1,\
args[2] += s2) {
#define BEGIN_OUTER_LOOP_4 \
for (N_ = 0;\
N_ < dN;\
N_++, args[0] += s0,\
args[1] += s1,\
args[2] += s2,\
args[3] += s3) {
#define BEGIN_OUTER_LOOP_5 \
for (N_ = 0;\
N_ < dN;\
N_++, args[0] += s0,\
args[1] += s1,\
args[2] += s2,\
args[3] += s3,\
args[4] += s4) {
#define BEGIN_OUTER_LOOP_6 \
for (N_ = 0;\
N_ < dN;\
N_++, args[0] += s0,\
args[1] += s1,\
args[2] += s2,\
args[3] += s3,\
args[4] += s4,\
args[5] += s5) {
#define END_OUTER_LOOP }
static inline void
update_pointers(npy_uint8** bases, ptrdiff_t* offsets, size_t count)
{
size_t i;
for (i=0; i < count; ++i) {
bases[i] += offsets[i];
}
}
/* disable -Wmaybe-uninitialized as there is some code that generate false
positives with this warning
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
/*
*****************************************************************************
** HELPER FUNCS **
*****************************************************************************
*/
/* rearranging of 2D matrices using blas */
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#typ=float,double,COMPLEX_t,DOUBLECOMPLEX_t#
#copy=scopy,dcopy,ccopy,zcopy#
#nan=s_nan, d_nan, c_nan, z_nan#
*/
static inline void *
linearize_@TYPE@_matrix(void *dst_in,
void *src_in,
const LINEARIZE_DATA_t* data)
{
@typ@ *src = (@typ@ *) src_in;
@typ@ *dst = (@typ@ *) dst_in;
if (dst) {
int i, j;
@typ@* rv = dst;
fortran_int columns = (fortran_int)data->columns;
fortran_int column_strides =
(fortran_int)(data->column_strides/sizeof(@typ@));
fortran_int one = 1;
for (i=0; i< data->rows; i++) {
if (column_strides > 0) {
FNAME(@copy@)(&columns,
(void*)src, &column_strides,
(void*)dst, &one);
}
else if (column_strides < 0) {
FNAME(@copy@)(&columns,
(void*)((@typ@*)src + (columns-1)*column_strides),
&column_strides,
(void*)dst, &one);
}
else {
/*
* Zero stride has undefined behavior in some BLAS
* implementations (e.g. OSX Accelerate), so do it
* manually
*/
for (j = 0; j < columns; ++j) {
memcpy((@typ@*)dst + j, (@typ@*)src, sizeof(@typ@));
}
}
src += data->row_strides/sizeof(@typ@);
dst += data->columns;
}
return rv;
} else {
return src;
}
}
static inline void *
delinearize_@TYPE@_matrix(void *dst_in,
void *src_in,
const LINEARIZE_DATA_t* data)
{
@typ@ *src = (@typ@ *) src_in;
@typ@ *dst = (@typ@ *) dst_in;
if (src) {
int i;
@typ@ *rv = src;
fortran_int columns = (fortran_int)data->columns;
fortran_int column_strides =
(fortran_int)(data->column_strides/sizeof(@typ@));
fortran_int one = 1;
for (i=0; i < data->rows; i++) {
if (column_strides > 0) {
FNAME(@copy@)(&columns,
(void*)src, &one,
(void*)dst, &column_strides);
}
else if (column_strides < 0) {
FNAME(@copy@)(&columns,
(void*)src, &one,
(void*)((@typ@*)dst + (columns-1)*column_strides),
&column_strides);
}
else {
/*
* Zero stride has undefined behavior in some BLAS
* implementations (e.g. OSX Accelerate), so do it
* manually
*/
if (columns > 0) {
memcpy((@typ@*)dst, (@typ@*)src + (columns-1), sizeof(@typ@));
}
}
src += data->columns;
dst += data->row_strides/sizeof(@typ@);
}
return rv;
} else {
return src;
}
}
static inline void
nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data)
{
@typ@ *dst = (@typ@ *) dst_in;
int i,j;
for (i=0; i < data->rows; i++) {
@typ@ *cp = dst;
ptrdiff_t cs = data->column_strides/sizeof(@typ@);
for (j=0; j< data->columns; ++j) {
*cp = @nan@;
cp += cs;
}
dst += data->row_strides/sizeof(@typ@);
}
}
/**end repeat**/
/* identity square matrix generation */
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#typ=float,double,COMPLEX_t,DOUBLECOMPLEX_t#
#cblas_type=s,d,c,z#
*/
static inline void
identity_@TYPE@_matrix(void *ptr, size_t n)
{
size_t i;
@typ@ *matrix = (@typ@*) ptr;
/* in IEEE floating point, zeroes are represented as bitwise 0 */
memset(matrix, 0, n*n*sizeof(@typ@));
for (i = 0; i < n; ++i)
{
*matrix = @cblas_type@_one;
matrix += n+1;
}
}
/**end repeat**/
/* lower/upper triangular matrix using blas (in place) */
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#typ=float,double,COMPLEX_t,DOUBLECOMPLEX_t#
#cblas_type=s,d,c,z#
*/
static inline void
triu_@TYPE@_matrix(void *ptr, size_t n)
{
size_t i,j;
@typ@ *matrix = (@typ@*)ptr;
matrix += n;
for (i=1; i < n; ++i) {
for (j=0; j<i; ++j) {
matrix[j] = @cblas_type@_zero;
}
matrix += n;
}
}
/**end repeat**/
/* -------------------------------------------------------------------------- */
/* Determinants */
/**begin repeat
#TYPE=FLOAT,DOUBLE#
#typ=npy_float, npy_double#
#log_func=npy_logf,npy_log#
#exp_func=npy_expf,npy_exp#
#zero=0.0f,0.0#
*/
static inline void
@TYPE@_slogdet_from_factored_diagonal(@typ@* src,
fortran_int m,
@typ@ *sign,
@typ@ *logdet)
{
@typ@ acc_sign = *sign;
@typ@ acc_logdet = @zero@;
int i;
for (i = 0; i < m; i++) {
@typ@ abs_element = *src;
if (abs_element < @zero@) {
acc_sign = -acc_sign;
abs_element = -abs_element;
}
acc_logdet += @log_func@(abs_element);
src += m+1;
}
*sign = acc_sign;
*logdet = acc_logdet;
}
static inline @typ@
@TYPE@_det_from_slogdet(@typ@ sign, @typ@ logdet)
{
@typ@ result = sign * @exp_func@(logdet);
return result;
}
/**end repeat**/
/**begin repeat
#TYPE=CFLOAT,CDOUBLE#
#typ=npy_cfloat, npy_cdouble#
#basetyp=npy_float, npy_double#
#abs_func=npy_cabsf, npy_cabs#
#log_func=npy_logf, npy_log#
#exp_func=npy_expf, npy_exp#
#zero=0.0f,0.0#
*/
#define RE(COMPLEX) (((@basetyp@*)(&COMPLEX))[0])
#define IM(COMPLEX) (((@basetyp@*)(&COMPLEX))[1])
static inline @typ@
@TYPE@_mult(@typ@ op1, @typ@ op2)
{
@typ@ rv;
RE(rv) = RE(op1)*RE(op2) - IM(op1)*IM(op2);
IM(rv) = RE(op1)*IM(op2) + IM(op1)*RE(op2);
return rv;
}
static inline void
@TYPE@_slogdet_from_factored_diagonal(@typ@* src,
fortran_int m,
@typ@ *sign,
@basetyp@ *logdet)
{
int i;
@typ@ sign_acc = *sign;
@basetyp@ logdet_acc = @zero@;
for (i = 0; i < m; i++)
{
@basetyp@ abs_element = @abs_func@(*src);
@typ@ sign_element;
RE(sign_element) = RE(*src) / abs_element;
IM(sign_element) = IM(*src) / abs_element;
sign_acc = @TYPE@_mult(sign_acc, sign_element);
logdet_acc += @log_func@(abs_element);
src += m + 1;
}
*sign = sign_acc;
*logdet = logdet_acc;
}
static inline @typ@
@TYPE@_det_from_slogdet(@typ@ sign, @basetyp@ logdet)
{
@typ@ tmp;
RE(tmp) = @exp_func@(logdet);
IM(tmp) = @zero@;
return @TYPE@_mult(sign, tmp);
}
#undef RE
#undef IM
/**end repeat**/
/* As in the linalg package, the determinant is computed via LU factorization
* using LAPACK.
* slogdet computes sign + log(determinant).
* det computes sign * exp(slogdet).
*/
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#typ=npy_float,npy_double,npy_cfloat,npy_cdouble#
#basetyp=npy_float,npy_double,npy_float,npy_double#
#cblas_type=s,d,c,z#
*/
static inline void
@TYPE@_slogdet_single_element(fortran_int m,
void* src,
fortran_int* pivots,
@typ@ *sign,
@basetyp@ *logdet)
{
fortran_int info = 0;
int i;
/* note: done in place */
LAPACK(@cblas_type@getrf)(&m, &m, (void *)src, &m, pivots, &info);
if (info == 0)
{
int change_sign = 0;
/* note: fortran uses 1 based indexing */
for (i=0; i < m; i++)
{
change_sign += (pivots[i] != (i+1));
}
memcpy(sign,
(change_sign % 2)?
&@cblas_type@_minus_one :
&@cblas_type@_one
, sizeof(*sign));
@TYPE@_slogdet_from_factored_diagonal(src, m, sign, logdet);
} else {
/*
if getrf fails, use 0 as sign and -inf as logdet
*/
memcpy(sign, &@cblas_type@_zero, sizeof(*sign));
memcpy(logdet, &@cblas_type@_ninf, sizeof(*logdet));
}
}
static void
@TYPE@_slogdet(char **args,
npy_intp *dimensions,
npy_intp *steps,
void *NPY_UNUSED(func))
{
fortran_int m;
npy_uint8 *tmp_buff = NULL;
size_t matrix_size;
size_t pivot_size;
/* notes:
* matrix will need to be copied always, as factorization in lapack is
* made inplace
* matrix will need to be in column-major order, as expected by lapack
* code (fortran)
* always a square matrix
* need to allocate memory for both, matrix_buffer and pivot buffer
*/
INIT_OUTER_LOOP_3
m = (fortran_int) dimensions[0];
matrix_size = m*m*sizeof(@typ@);
pivot_size = m*sizeof(fortran_int);
tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
if (tmp_buff)
{
LINEARIZE_DATA_t lin_data;
/* swapped steps to get matrix in FORTRAN order */
init_linearize_data(&lin_data, m, m,
(ptrdiff_t)steps[1],
(ptrdiff_t)steps[0]);
BEGIN_OUTER_LOOP_3
linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data);
@TYPE@_slogdet_single_element(m,
(void*)tmp_buff,
(fortran_int*)(tmp_buff+matrix_size),
(@typ@*)args[1],
(@basetyp@*)args[2]);
END_OUTER_LOOP
free(tmp_buff);
}
}
static void
@TYPE@_det(char **args,
npy_intp *dimensions,
npy_intp *steps,
void *NPY_UNUSED(func))
{
fortran_int m;
npy_uint8 *tmp_buff;
size_t matrix_size;
size_t pivot_size;
/* notes:
* matrix will need to be copied always, as factorization in lapack is
* made inplace
* matrix will need to be in column-major order, as expected by lapack
* code (fortran)
* always a square matrix
* need to allocate memory for both, matrix_buffer and pivot buffer
*/
INIT_OUTER_LOOP_2
m = (fortran_int) dimensions[0];
matrix_size = m*m*sizeof(@typ@);
pivot_size = m*sizeof(fortran_int);
tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
if (tmp_buff)
{
LINEARIZE_DATA_t lin_data;
@typ@ sign;
@basetyp@ logdet;
/* swapped steps to get matrix in FORTRAN order */
init_linearize_data(&lin_data, m, m,
(ptrdiff_t)steps[1],
(ptrdiff_t)steps[0]);
BEGIN_OUTER_LOOP_2
linearize_@TYPE@_matrix(tmp_buff, args[0], &lin_data);
@TYPE@_slogdet_single_element(m,
(void*)tmp_buff,
(fortran_int*)(tmp_buff+matrix_size),
&sign,
&logdet);
*(@typ@ *)args[1] = @TYPE@_det_from_slogdet(sign, logdet);
END_OUTER_LOOP
free(tmp_buff);
}
}
/**end repeat**/
/* -------------------------------------------------------------------------- */
/* Eigh family */
typedef struct eigh_params_struct {
void *A; /* matrix */
void *W; /* eigenvalue vector */
void *WORK; /* main work buffer */
void *RWORK; /* secondary work buffer (for complex versions) */
void *IWORK;
fortran_int N;
fortran_int LWORK;
fortran_int LRWORK;
fortran_int LIWORK;
char JOBZ;
char UPLO;
} EIGH_PARAMS_t;
/**begin repeat
#TYPE=FLOAT,DOUBLE#
#typ=npy_float,npy_double#
#ftyp=fortran_real,fortran_doublereal#
#lapack_func=ssyevd,dsyevd#
*/
/*
* Initialize the parameters to use in for the lapack function _syevd
* Handles buffer allocation
*/
static inline int
init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
fortran_int N)
{
npy_uint8 *mem_buff = NULL;
npy_uint8 *mem_buff2 = NULL;
@typ@ query_work_size;
fortran_int query_iwork_size;
fortran_int lwork = -1;
fortran_int liwork = -1;
fortran_int info;
npy_uint8 *a, *w, *work, *iwork;
size_t alloc_size = N*(N+1)*sizeof(@typ@);
mem_buff = malloc(alloc_size);
if (!mem_buff)
goto error;
a = mem_buff;
w = mem_buff + N*N*sizeof(@typ@);
LAPACK(@lapack_func@)(&JOBZ, &UPLO, &N,
(@ftyp@*)a, &N, (@ftyp@*)w,
&query_work_size, &lwork,
&query_iwork_size, &liwork,
&info);
if (info != 0)
goto error;
work = mem_buff;
lwork = (fortran_int)query_work_size;
liwork = query_iwork_size;
mem_buff2 = malloc(lwork*sizeof(@typ@) + liwork*sizeof(fortran_int));
if (!mem_buff2)
goto error;
work = mem_buff2;
iwork = mem_buff2 + lwork*sizeof(@typ@);
params->A = a;
params->W = w;
params->WORK = work;
params->RWORK = NULL; /* unused */
params->IWORK = iwork;
params->N = N;
params->LWORK = lwork;
params->LRWORK = 0; /* unused */
params->LIWORK = liwork;
params->JOBZ = JOBZ;
params->UPLO = UPLO;
return 1;
error:
/* something failed */
memset(params, 0, sizeof(*params));
free(mem_buff2);
free(mem_buff);
return 0;
}
static inline fortran_int
call_@lapack_func@(EIGH_PARAMS_t *params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->JOBZ, &params->UPLO, &params->N,
params->A, &params->N, params->W,
params->WORK, &params->LWORK,
params->IWORK, &params->LIWORK,
&rv);
return rv;
}
/**end repeat**/
/**begin repeat
#TYPE=CFLOAT,CDOUBLE#
#typ=npy_cfloat,npy_cdouble#
#basetyp=npy_float,npy_double#
#ftyp=fortran_complex,fortran_doublecomplex#
#fbasetyp=fortran_real,fortran_doublereal#
#lapack_func=cheevd,zheevd#
*/
/*
* Initialize the parameters to use in for the lapack function _heev
* Handles buffer allocation
*/
static inline int
init_@lapack_func@(EIGH_PARAMS_t *params,
char JOBZ,
char UPLO,
fortran_int N)
{
npy_uint8 *mem_buff = NULL;
npy_uint8 *mem_buff2 = NULL;
@ftyp@ query_work_size;
@fbasetyp@ query_rwork_size;
fortran_int query_iwork_size;
fortran_int lwork = -1;
fortran_int lrwork = -1;
fortran_int liwork = -1;
npy_uint8 *a, *w, *work, *rwork, *iwork;
fortran_int info;
mem_buff = malloc(N*N*sizeof(@typ@)+N*sizeof(@basetyp@));
if (!mem_buff)
goto error;
a = mem_buff;
w = mem_buff+N*N*sizeof(@typ@);
LAPACK(@lapack_func@)(&JOBZ, &UPLO, &N,
(@ftyp@*)a, &N, (@fbasetyp@*)w,
&query_work_size, &lwork,
&query_rwork_size, &lrwork,
&query_iwork_size, &liwork,
&info);
if (info != 0)
goto error;
lwork = (fortran_int)*(@fbasetyp@*)&query_work_size;
lrwork = (fortran_int)query_rwork_size;
liwork = query_iwork_size;
mem_buff2 = malloc(lwork*sizeof(@typ@) +
lrwork*sizeof(@basetyp@) +
liwork*sizeof(fortran_int));
if (!mem_buff2)
goto error;
work = mem_buff2;
rwork = work + lwork*sizeof(@typ@);
iwork = rwork + lrwork*sizeof(@basetyp@);
params->A = a;
params->W = w;
params->WORK = work;
params->RWORK = rwork;
params->IWORK = iwork;
params->N = N;
params->LWORK = lwork;
params->LRWORK = lrwork;
params->LIWORK = liwork;
params->JOBZ = JOBZ;
params->UPLO = UPLO;
return 1;
/* something failed */
error:
memset(params, 0, sizeof(*params));
free(mem_buff2);
free(mem_buff);
return 0;
}
static inline fortran_int
call_@lapack_func@(EIGH_PARAMS_t *params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->JOBZ, &params->UPLO, &params->N,
params->A, &params->N, params->W,
params->WORK, &params->LWORK,
params->RWORK, &params->LRWORK,
params->IWORK, &params->LIWORK,
&rv);
return rv;
}
/**end repeat**/
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#BASETYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
#typ=npy_float,npy_double,npy_cfloat,npy_cdouble#
#basetyp=npy_float,npy_double,npy_float,npy_double#
#lapack_func=ssyevd,dsyevd,cheevd,zheevd#
**/
/*
* (M,M)->(M,)(M,M)
* dimensions[1] -> M
* args[0] -> A[in]
* args[1] -> W
* args[2] -> A[out]
*/
static inline void
release_@lapack_func@(EIGH_PARAMS_t *params)
{
/* allocated memory in A and WORK */
free(params->A);
free(params->WORK);
memset(params, 0, sizeof(*params));
}
static inline void
@TYPE@_eigh_wrapper(char JOBZ,
char UPLO,
char**args,
npy_intp* dimensions,
npy_intp* steps)
{
ptrdiff_t outer_steps[3];
size_t iter;
size_t outer_dim = *dimensions++;
size_t op_count = (JOBZ=='N')?2:3;
EIGH_PARAMS_t eigh_params;
int error_occurred = get_fp_invalid_and_clear();
for (iter=0; iter < op_count; ++iter) {
outer_steps[iter] = (ptrdiff_t) steps[iter];
}
steps += op_count;
if (init_@lapack_func@(&eigh_params,
JOBZ,
UPLO,
(fortran_int)dimensions[0])) {
LINEARIZE_DATA_t matrix_in_ld;
LINEARIZE_DATA_t eigenvectors_out_ld;
LINEARIZE_DATA_t eigenvalues_out_ld;
init_linearize_data(&matrix_in_ld,
eigh_params.N, eigh_params.N,
steps[1], steps[0]);
init_linearize_data(&eigenvalues_out_ld,
1, eigh_params.N,
0, steps[2]);
if ('V' == eigh_params.JOBZ) {
init_linearize_data(&eigenvectors_out_ld,
eigh_params.N, eigh_params.N,
steps[4], steps[3]);
}
for (iter = 0; iter < outer_dim; ++iter) {
int not_ok;
/* copy the matrix in */
linearize_@TYPE@_matrix(eigh_params.A, args[0], &matrix_in_ld);
not_ok = call_@lapack_func@(&eigh_params);
if (!not_ok) {
/* lapack ok, copy result out */
delinearize_@BASETYPE@_matrix(args[1],
eigh_params.W,
&eigenvalues_out_ld);
if ('V' == eigh_params.JOBZ) {
delinearize_@TYPE@_matrix(args[2],
eigh_params.A,
&eigenvectors_out_ld);
}
} else {
/* lapack fail, set result to nan */
error_occurred = 1;
nan_@BASETYPE@_matrix(args[1], &eigenvalues_out_ld);
if ('V' == eigh_params.JOBZ) {
nan_@TYPE@_matrix(args[2], &eigenvectors_out_ld);
}
}
update_pointers((npy_uint8**)args, outer_steps, op_count);
}
release_@lapack_func@(&eigh_params);
}
set_fp_invalid_or_clear(error_occurred);
}
/**end repeat**/
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
*/
static void
@TYPE@_eighlo(char **args,
npy_intp *dimensions,
npy_intp *steps,
void *NPY_UNUSED(func))
{
@TYPE@_eigh_wrapper('V', 'L', args, dimensions, steps);
}
static void
@TYPE@_eighup(char **args,
npy_intp *dimensions,
npy_intp *steps,
void* NPY_UNUSED(func))
{
@TYPE@_eigh_wrapper('V', 'U', args, dimensions, steps);
}
static void
@TYPE@_eigvalshlo(char **args,
npy_intp *dimensions,
npy_intp *steps,
void* NPY_UNUSED(func))
{
@TYPE@_eigh_wrapper('N', 'L', args, dimensions, steps);
}
static void
@TYPE@_eigvalshup(char **args,
npy_intp *dimensions,
npy_intp *steps,
void* NPY_UNUSED(func))
{
@TYPE@_eigh_wrapper('N', 'U', args, dimensions, steps);
}
/**end repeat**/
/* -------------------------------------------------------------------------- */
/* Solve family (includes inv) */
typedef struct gesv_params_struct
{
void *A; /* A is (N,N) of base type */
void *B; /* B is (N,NRHS) of base type */
fortran_int * IPIV; /* IPIV is (N) */
fortran_int N;
fortran_int NRHS;
fortran_int LDA;
fortran_int LDB;
} GESV_PARAMS_t;
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#typ=npy_float,npy_double,npy_cfloat,npy_cdouble#
#ftyp=fortran_real,fortran_doublereal,fortran_complex,fortran_doublecomplex#
#lapack_func=sgesv,dgesv,cgesv,zgesv#
*/
/*
* Initialize the parameters to use in for the lapack function _heev
* Handles buffer allocation
*/
static inline int
init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS)
{
npy_uint8 *mem_buff = NULL;
npy_uint8 *a, *b, *ipiv;
mem_buff = malloc(N*N*sizeof(@ftyp@) +
N*NRHS*sizeof(@ftyp@) +
N*sizeof(fortran_int));
if (!mem_buff)
goto error;
a = mem_buff;
b = a + N*N*sizeof(@ftyp@);
ipiv = b + N*NRHS*sizeof(@ftyp@);
params->A = a;
params->B = b;
params->IPIV = (fortran_int*)ipiv;
params->N = N;
params->NRHS = NRHS;
params->LDA = N;
params->LDB = N;
return 1;
error:
free(mem_buff);
memset(params, 0, sizeof(*params));
return 0;
}
static inline void
release_@lapack_func@(GESV_PARAMS_t *params)
{
/* memory block base is in A */
free(params->A);
memset(params, 0, sizeof(*params));
}
static inline fortran_int
call_@lapack_func@(GESV_PARAMS_t *params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->N, &params->NRHS,
params->A, &params->LDA,
params->IPIV,
params->B, &params->LDB,
&rv);
return rv;
}
static void
@TYPE@_solve(char **args, npy_intp *dimensions, npy_intp *steps,
void *NPY_UNUSED(func))
{
GESV_PARAMS_t params;
fortran_int n, nrhs;
int error_occurred = get_fp_invalid_and_clear();
INIT_OUTER_LOOP_3
n = (fortran_int)dimensions[0];
nrhs = (fortran_int)dimensions[1];
if (init_@lapack_func@(&params, n, nrhs)) {
LINEARIZE_DATA_t a_in, b_in, r_out;
init_linearize_data(&a_in, n, n, steps[1], steps[0]);
init_linearize_data(&b_in, nrhs, n, steps[3], steps[2]);
init_linearize_data(&r_out, nrhs, n, steps[5], steps[4]);
BEGIN_OUTER_LOOP_3
int not_ok;
linearize_@TYPE@_matrix(params.A, args[0], &a_in);
linearize_@TYPE@_matrix(params.B, args[1], &b_in);
not_ok =call_@lapack_func@(&params);
if (!not_ok) {
delinearize_@TYPE@_matrix(args[2], params.B, &r_out);
} else {
error_occurred = 1;
nan_@TYPE@_matrix(args[2], &r_out);
}
END_OUTER_LOOP
release_@lapack_func@(&params);
}
set_fp_invalid_or_clear(error_occurred);
}
static void
@TYPE@_solve1(char **args, npy_intp *dimensions, npy_intp *steps,
void *NPY_UNUSED(func))
{
GESV_PARAMS_t params;
int error_occurred = get_fp_invalid_and_clear();
fortran_int n;
INIT_OUTER_LOOP_3
n = (fortran_int)dimensions[0];
if (init_@lapack_func@(&params, n, 1)) {
LINEARIZE_DATA_t a_in, b_in, r_out;
init_linearize_data(&a_in, n, n, steps[1], steps[0]);
init_linearize_data(&b_in, 1, n, 1, steps[2]);
init_linearize_data(&r_out, 1, n, 1, steps[3]);
BEGIN_OUTER_LOOP_3
int not_ok;
linearize_@TYPE@_matrix(params.A, args[0], &a_in);
linearize_@TYPE@_matrix(params.B, args[1], &b_in);
not_ok = call_@lapack_func@(&params);
if (!not_ok) {
delinearize_@TYPE@_matrix(args[2], params.B, &r_out);
} else {
error_occurred = 1;
nan_@TYPE@_matrix(args[2], &r_out);
}
END_OUTER_LOOP
release_@lapack_func@(&params);
}
set_fp_invalid_or_clear(error_occurred);
}
static void
@TYPE@_inv(char **args, npy_intp *dimensions, npy_intp *steps,
void *NPY_UNUSED(func))
{
GESV_PARAMS_t params;
fortran_int n;
int error_occurred = get_fp_invalid_and_clear();
INIT_OUTER_LOOP_2
n = (fortran_int)dimensions[0];
if (init_@lapack_func@(&params, n, n)) {
LINEARIZE_DATA_t a_in, r_out;
init_linearize_data(&a_in, n, n, steps[1], steps[0]);
init_linearize_data(&r_out, n, n, steps[3], steps[2]);
BEGIN_OUTER_LOOP_2
int not_ok;
linearize_@TYPE@_matrix(params.A, args[0], &a_in);
identity_@TYPE@_matrix(params.B, n);
not_ok = call_@lapack_func@(&params);
if (!not_ok) {
delinearize_@TYPE@_matrix(args[1], params.B, &r_out);
} else {
error_occurred = 1;
nan_@TYPE@_matrix(args[1], &r_out);
}
END_OUTER_LOOP
release_@lapack_func@(&params);
}
set_fp_invalid_or_clear(error_occurred);
}
/**end repeat**/
/* -------------------------------------------------------------------------- */
/* Cholesky decomposition */
typedef struct potr_params_struct
{
void *A;
fortran_int N;
fortran_int LDA;
char UPLO;
} POTR_PARAMS_t;
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#ftyp=fortran_real, fortran_doublereal,
fortran_complex, fortran_doublecomplex#
#lapack_func=spotrf,dpotrf,cpotrf,zpotrf#
*/
static inline int
init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N)
{
npy_uint8 *mem_buff = NULL;
npy_uint8 *a;
mem_buff = malloc(N*N*sizeof(@ftyp@));
if (!mem_buff)
goto error;
a = mem_buff;
params->A = a;
params->N = N;
params->LDA = N;
params->UPLO = UPLO;
return 1;
error:
free(mem_buff);
memset(params, 0, sizeof(*params));
return 0;
}
static inline void
release_@lapack_func@(POTR_PARAMS_t *params)
{
/* memory block base in A */
free(params->A);
memset(params, 0, sizeof(*params));
}
static inline fortran_int
call_@lapack_func@(POTR_PARAMS_t *params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->UPLO,
&params->N, params->A, &params->LDA,
&rv);
return rv;
}
static void
@TYPE@_cholesky(char uplo, char **args, npy_intp *dimensions, npy_intp *steps)
{
POTR_PARAMS_t params;
int error_occurred = get_fp_invalid_and_clear();
fortran_int n;
INIT_OUTER_LOOP_2
assert(uplo == 'L');
n = (fortran_int)dimensions[0];
if (init_@lapack_func@(&params, uplo, n))
{
LINEARIZE_DATA_t a_in, r_out;
init_linearize_data(&a_in, n, n, steps[1], steps[0]);
init_linearize_data(&r_out, n, n, steps[3], steps[2]);
BEGIN_OUTER_LOOP_2
int not_ok;
linearize_@TYPE@_matrix(params.A, args[0], &a_in);
not_ok = call_@lapack_func@(&params);
if (!not_ok) {
triu_@TYPE@_matrix(params.A, params.N);
delinearize_@TYPE@_matrix(args[1], params.A, &r_out);
} else {
error_occurred = 1;
nan_@TYPE@_matrix(args[1], &r_out);
}
END_OUTER_LOOP
release_@lapack_func@(&params);
}
set_fp_invalid_or_clear(error_occurred);
}
static void
@TYPE@_cholesky_lo(char **args, npy_intp *dimensions, npy_intp *steps,
void *NPY_UNUSED(func))
{
@TYPE@_cholesky('L', args, dimensions, steps);
}
/**end repeat**/
/* -------------------------------------------------------------------------- */
/* eig family */
typedef struct geev_params_struct {
void *A;
void *WR; /* RWORK in complex versions, REAL W buffer for (sd)geev*/
void *WI;
void *VLR; /* REAL VL buffers for _geev where _ is s, d */
void *VRR; /* REAL VR buffers for _geev hwere _ is s, d */
void *WORK;
void *W; /* final w */
void *VL; /* final vl */
void *VR; /* final vr */
fortran_int N;
fortran_int LDA;
fortran_int LDVL;
fortran_int LDVR;
fortran_int LWORK;
char JOBVL;
char JOBVR;
} GEEV_PARAMS_t;
static inline void
dump_geev_params(const char *name, GEEV_PARAMS_t* params)
{
TRACE_TXT("\n%s\n"
"\t%10s: %p\n"\
"\t%10s: %p\n"\
"\t%10s: %p\n"\
"\t%10s: %p\n"\
"\t%10s: %p\n"\
"\t%10s: %p\n"\
"\t%10s: %p\n"\
"\t%10s: %p\n"\
"\t%10s: %p\n"\
"\t%10s: %d\n"\
"\t%10s: %d\n"\
"\t%10s: %d\n"\
"\t%10s: %d\n"\
"\t%10s: %d\n"\
"\t%10s: %c\n"\
"\t%10s: %c\n",
name,
"A", params->A,
"WR", params->WR,
"WI", params->WI,
"VLR", params->VLR,
"VRR", params->VRR,
"WORK", params->WORK,
"W", params->W,
"VL", params->VL,
"VR", params->VR,
"N", (int)params->N,
"LDA", (int)params->LDA,
"LDVL", (int)params->LDVL,
"LDVR", (int)params->LDVR,
"LWORK", (int)params->LWORK,
"JOBVL", params->JOBVL,
"JOBVR", params->JOBVR);
}
/**begin repeat
#TYPE=FLOAT,DOUBLE#
#CTYPE=CFLOAT,CDOUBLE#
#typ=float,double#
#complextyp=COMPLEX_t,DOUBLECOMPLEX_t#
#lapack_func=sgeev,dgeev#
#zero=0.0f,0.0#
*/
static inline int
init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n)
{
npy_uint8 *mem_buff=NULL;
npy_uint8 *mem_buff2=NULL;
npy_uint8 *a, *wr, *wi, *vlr, *vrr, *work, *w, *vl, *vr;
size_t a_size = n*n*sizeof(@typ@);
size_t wr_size = n*sizeof(@typ@);
size_t wi_size = n*sizeof(@typ@);
size_t vlr_size = jobvl=='V' ? n*n*sizeof(@typ@) : 0;
size_t vrr_size = jobvr=='V' ? n*n*sizeof(@typ@) : 0;
size_t w_size = wr_size*2;
size_t vl_size = vlr_size*2;
size_t vr_size = vrr_size*2;
size_t work_count = 0;
@typ@ work_size_query;
fortran_int do_size_query = -1;
fortran_int rv;
/* allocate data for known sizes (all but work) */
mem_buff = malloc(a_size + wr_size + wi_size +
vlr_size + vrr_size +
w_size + vl_size + vr_size);
if (!mem_buff)
goto error;
a = mem_buff;
wr = a + a_size;
wi = wr + wr_size;
vlr = wi + wi_size;
vrr = vlr + vlr_size;
w = vrr + vrr_size;
vl = w + w_size;
vr = vl + vl_size;
LAPACK(@lapack_func@)(&jobvl, &jobvr, &n,
(void *)a, &n, (void *)wr, (void *)wi,
(void *)vl, &n, (void *)vr, &n,
&work_size_query, &do_size_query,
&rv);
if (0 != rv)
goto error;
work_count = (size_t)work_size_query;
mem_buff2 = malloc(work_count*sizeof(@typ@));
if (!mem_buff2)
goto error;
work = mem_buff2;
params->A = a;
params->WR = wr;
params->WI = wi;
params->VLR = vlr;
params->VRR = vrr;
params->WORK = work;
params->W = w;
params->VL = vl;
params->VR = vr;
params->N = n;
params->LDA = n;
params->LDVL = n;
params->LDVR = n;
params->LWORK = (fortran_int)work_count;
params->JOBVL = jobvl;
params->JOBVR = jobvr;
return 1;
error:
free(mem_buff2);
free(mem_buff);
memset(params, 0, sizeof(*params));
return 0;
}
static inline fortran_int
call_@lapack_func@(GEEV_PARAMS_t* params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->JOBVL, &params->JOBVR,
&params->N, params->A, &params->LDA,
params->WR, params->WI,
params->VLR, &params->LDVL,
params->VRR, &params->LDVR,
params->WORK, &params->LWORK,
&rv);
return rv;
}
static inline void
mk_@TYPE@_complex_array_from_real(@complextyp@ *c, const @typ@ *re, size_t n)
{
size_t iter;
for (iter = 0; iter < n; ++iter) {
c[iter].array[0] = re[iter];
c[iter].array[1] = @zero@;
}
}
static inline void
mk_@TYPE@_complex_array(@complextyp@ *c,
const @typ@ *re,
const @typ@ *im,
size_t n)
{
size_t iter;
for (iter = 0; iter < n; ++iter) {
c[iter].array[0] = re[iter];
c[iter].array[1] = im[iter];
}
}
static inline void
mk_@TYPE@_complex_array_conjugate_pair(@complextyp@ *c,
const @typ@ *r,
size_t n)
{
size_t iter;
for (iter = 0; iter < n; ++iter) {
@typ@ re = r[iter];
@typ@ im = r[iter+n];
c[iter].array[0] = re;
c[iter].array[1] = im;
c[iter+n].array[0] = re;
c[iter+n].array[1] = -im;
}
}
/*
* make the complex eigenvectors from the real array produced by sgeev/zgeev.
* c is the array where the results will be left.
* r is the source array of reals produced by sgeev/zgeev
* i is the eigenvalue imaginary part produced by sgeev/zgeev
* n is so that the order of the matrix is n by n
*/
static inline void
mk_@lapack_func@_complex_eigenvectors(@complextyp@ *c,
const @typ@ *r,
const @typ@ *i,
size_t n)
{
size_t iter = 0;
while (iter < n)
{
if (i[iter] == @zero@) {
/* eigenvalue was real, eigenvectors as well... */
mk_@TYPE@_complex_array_from_real(c, r, n);
c += n;
r += n;
iter ++;
} else {
/* eigenvalue was complex, generate a pair of eigenvectors */
mk_@TYPE@_complex_array_conjugate_pair(c, r, n);
c += 2*n;
r += 2*n;
iter += 2;
}
}
}
static inline void
process_@lapack_func@_results(GEEV_PARAMS_t *params)
{
/* REAL versions of geev need the results to be translated
* into complex versions. This is the way to deal with imaginary
* results. In our gufuncs we will always return complex arrays!
*/
mk_@TYPE@_complex_array(params->W, params->WR, params->WI, params->N);
/* handle the eigenvectors */
if ('V' == params->JOBVL) {
mk_@lapack_func@_complex_eigenvectors(params->VL, params->VLR,
params->WI, params->N);
}
if ('V' == params->JOBVR) {
mk_@lapack_func@_complex_eigenvectors(params->VR, params->VRR,
params->WI, params->N);
}
}
/**end repeat**/
/**begin repeat
#TYPE=CFLOAT,CDOUBLE#
#typ=COMPLEX_t,DOUBLECOMPLEX_t#
#ftyp=fortran_complex,fortran_doublecomplex#
#realtyp=float,double#
#lapack_func=cgeev,zgeev#
*/
static inline int
init_@lapack_func@(GEEV_PARAMS_t* params,
char jobvl,
char jobvr,
fortran_int n)
{
npy_uint8 *mem_buff = NULL;
npy_uint8 *mem_buff2 = NULL;
npy_uint8 *a, *w, *vl, *vr, *work, *rwork;
size_t a_size = n*n*sizeof(@ftyp@);
size_t w_size = n*sizeof(@ftyp@);
size_t vl_size = jobvl=='V'? n*n*sizeof(@ftyp@) : 0;
size_t vr_size = jobvr=='V'? n*n*sizeof(@ftyp@) : 0;
size_t rwork_size = 2*n*sizeof(@realtyp@);
size_t work_count = 0;
@typ@ work_size_query;
fortran_int do_size_query = -1;
fortran_int rv;
size_t total_size = a_size + w_size + vl_size + vr_size + rwork_size;
mem_buff = malloc(total_size);
if (!mem_buff)
goto error;
a = mem_buff;
w = a + a_size;
vl = w + w_size;
vr = vl + vl_size;
rwork = vr + vr_size;
LAPACK(@lapack_func@)(&jobvl, &jobvr, &n,
(void *)a, &n, (void *)w,
(void *)vl, &n, (void *)vr, &n,
(void *)&work_size_query, &do_size_query,
(void *)rwork,
&rv);
if (0 != rv)
goto error;
work_count = (size_t) work_size_query.array[0];
mem_buff2 = malloc(work_count*sizeof(@ftyp@));
if (!mem_buff2)
goto error;
work = mem_buff2;
params->A = a;
params->WR = rwork;
params->WI = NULL;
params->VLR = NULL;
params->VRR = NULL;
params->VL = vl;
params->VR = vr;
params->WORK = work;
params->W = w;
params->N = n;
params->LDA = n;
params->LDVL = n;
params->LDVR = n;
params->LWORK = (fortran_int)work_count;
params->JOBVL = jobvl;
params->JOBVR = jobvr;
return 1;
error:
free(mem_buff2);
free(mem_buff);
memset(params, 0, sizeof(*params));
return 0;
}
static inline fortran_int
call_@lapack_func@(GEEV_PARAMS_t* params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->JOBVL, &params->JOBVR,
&params->N, params->A, &params->LDA,
params->W,
params->VL, &params->LDVL,
params->VR, &params->LDVR,
params->WORK, &params->LWORK,
params->WR, /* actually RWORK */
&rv);
return rv;
}
static inline void
process_@lapack_func@_results(GEEV_PARAMS_t *NPY_UNUSED(params))
{
/* nothing to do here, complex versions are ready to copy out */
}
/**end repeat**/
/**begin repeat
#TYPE=FLOAT,DOUBLE,CDOUBLE#
#COMPLEXTYPE=CFLOAT,CDOUBLE,CDOUBLE#
#ftype=fortran_real,fortran_doublereal,fortran_doublecomplex#
#lapack_func=sgeev,dgeev,zgeev#
*/
static inline void
release_@lapack_func@(GEEV_PARAMS_t *params)
{
free(params->WORK);
free(params->A);
memset(params, 0, sizeof(*params));
}
static inline void
@TYPE@_eig_wrapper(char JOBVL,
char JOBVR,
char**args,
npy_intp* dimensions,
npy_intp* steps)
{
ptrdiff_t outer_steps[4];
size_t iter;
size_t outer_dim = *dimensions++;
size_t op_count = 2;
int error_occurred = get_fp_invalid_and_clear();
GEEV_PARAMS_t geev_params;
assert(JOBVL == 'N');
STACK_TRACE;
op_count += 'V'==JOBVL?1:0;
op_count += 'V'==JOBVR?1:0;
for (iter=0; iter < op_count; ++iter) {
outer_steps[iter] = (ptrdiff_t) steps[iter];
}
steps += op_count;
if (init_@lapack_func@(&geev_params,
JOBVL, JOBVR,
(fortran_int)dimensions[0])) {
LINEARIZE_DATA_t a_in;
LINEARIZE_DATA_t w_out;
LINEARIZE_DATA_t vl_out;
LINEARIZE_DATA_t vr_out;
init_linearize_data(&a_in,
geev_params.N, geev_params.N,
steps[1], steps[0]);
steps += 2;
init_linearize_data(&w_out,
1, geev_params.N,
0, steps[0]);
steps += 1;
if ('V' == geev_params.JOBVL) {
init_linearize_data(&vl_out,
geev_params.N, geev_params.N,
steps[1], steps[0]);
steps += 2;
}
if ('V' == geev_params.JOBVR) {
init_linearize_data(&vr_out,
geev_params.N, geev_params.N,
steps[1], steps[0]);
}
for (iter = 0; iter < outer_dim; ++iter) {
int not_ok;
char **arg_iter = args;
/* copy the matrix in */
linearize_@TYPE@_matrix(geev_params.A, *arg_iter++, &a_in);
not_ok = call_@lapack_func@(&geev_params);
if (!not_ok) {
process_@lapack_func@_results(&geev_params);
delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
geev_params.W,
&w_out);
if ('V' == geev_params.JOBVL)
delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
geev_params.VL,
&vl_out);
if ('V' == geev_params.JOBVR)
delinearize_@COMPLEXTYPE@_matrix(*arg_iter++,
geev_params.VR,
&vr_out);
} else {
/* geev failed */
error_occurred = 1;
nan_@COMPLEXTYPE@_matrix(*arg_iter++, &w_out);
if ('V' == geev_params.JOBVL)
nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vl_out);
if ('V' == geev_params.JOBVR)
nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vr_out);
}
update_pointers((npy_uint8**)args, outer_steps, op_count);
}
release_@lapack_func@(&geev_params);
}
set_fp_invalid_or_clear(error_occurred);
}
static void
@TYPE@_eig(char **args,
npy_intp *dimensions,
npy_intp *steps,
void *NPY_UNUSED(func))
{
@TYPE@_eig_wrapper('N', 'V', args, dimensions, steps);
}
static void
@TYPE@_eigvals(char **args,
npy_intp *dimensions,
npy_intp *steps,
void *NPY_UNUSED(func))
{
@TYPE@_eig_wrapper('N', 'N', args, dimensions, steps);
}
/**end repeat**/
/* -------------------------------------------------------------------------- */
/* singular value decomposition */
typedef struct gessd_params_struct
{
void *A;
void *S;
void *U;
void *VT;
void *WORK;
void *RWORK;
void *IWORK;
fortran_int M;
fortran_int N;
fortran_int LDA;
fortran_int LDU;
fortran_int LDVT;
fortran_int LWORK;
char JOBZ;
} GESDD_PARAMS_t;
static inline void
dump_gesdd_params(const char *name,
GESDD_PARAMS_t *params)
{
TRACE_TXT("\n%s:\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18p\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %18d\n"\
"%14s: %15c'%c'\n",
name,
"A", params->A,
"S", params->S,
"U", params->U,
"VT", params->VT,
"WORK", params->WORK,
"RWORK", params->RWORK,
"IWORK", params->IWORK,
"M", (int)params->M,
"N", (int)params->N,
"LDA", (int)params->LDA,
"LDU", (int)params->LDU,
"LDVT", (int)params->LDVT,
"LWORK", (int)params->LWORK,
"JOBZ", ' ',params->JOBZ);
}
static inline int
compute_urows_vtcolumns(char jobz,
fortran_int m, fortran_int n,
fortran_int *urows, fortran_int *vtcolumns)
{
fortran_int min_m_n = m<n?m:n;
switch(jobz)
{
case 'N':
*urows = 0;
*vtcolumns = 0;
break;
case 'A':
*urows = m;
*vtcolumns = n;
break;
case 'S':
{
*urows = min_m_n;
*vtcolumns = min_m_n;
}
break;
default:
return 0;
}
return 1;
}
/**begin repeat
#TYPE=FLOAT,DOUBLE#
#lapack_func=sgesdd,dgesdd#
#ftyp=fortran_real,fortran_doublereal#
*/
static inline int
init_@lapack_func@(GESDD_PARAMS_t *params,
char jobz,
fortran_int m,
fortran_int n)
{
npy_uint8 *mem_buff = NULL;
npy_uint8 *mem_buff2 = NULL;
npy_uint8 *a, *s, *u, *vt, *work, *iwork;
size_t a_size = (size_t)m*(size_t)n*sizeof(@ftyp@);
fortran_int min_m_n = m<n?m:n;
size_t s_size = ((size_t)min_m_n)*sizeof(@ftyp@);
fortran_int u_row_count, vt_column_count;
size_t u_size, vt_size;
fortran_int work_count;
size_t work_size;
size_t iwork_size = 8*((size_t)min_m_n)*sizeof(fortran_int);
if (!compute_urows_vtcolumns(jobz, m, n, &u_row_count, &vt_column_count))
goto error;
u_size = ((size_t)u_row_count)*m*sizeof(@ftyp@);
vt_size = n*((size_t)vt_column_count)*sizeof(@ftyp@);
mem_buff = malloc(a_size + s_size + u_size + vt_size + iwork_size);
if (!mem_buff)
goto error;
a = mem_buff;
s = a + a_size;
u = s + s_size;
vt = u + u_size;
iwork = vt + vt_size;
/* fix vt_column_count so that it is a valid lapack parameter (0 is not) */
vt_column_count = vt_column_count < 1? 1 : vt_column_count;
{
/* compute optimal work size */
@ftyp@ work_size_query;
fortran_int do_query = -1;
fortran_int rv;
LAPACK(@lapack_func@)(&jobz, &m, &n,
(void*)a, &m, (void*)s, (void*)u, &m,
(void*)vt, &vt_column_count,
&work_size_query, &do_query,
(void*)iwork, &rv);
if (0!=rv)
goto error;
work_count = (fortran_int)work_size_query;
work_size = (size_t)work_count * sizeof(@ftyp@);
}
mem_buff2 = malloc(work_size);
if (!mem_buff2)
goto error;
work = mem_buff2;
params->M = m;
params->N = n;
params->A = a;
params->S = s;
params->U = u;
params->VT = vt;
params->WORK = work;
params->RWORK = NULL;
params->IWORK = iwork;
params->M = m;
params->N = n;
params->LDA = m;
params->LDU = m;
params->LDVT = vt_column_count;
params->LWORK = work_count;
params->JOBZ = jobz;
return 1;
error:
TRACE_TXT("%s failed init\n", __FUNCTION__);
free(mem_buff);
free(mem_buff2);
memset(params, 0, sizeof(*params));
return 0;
}
static inline fortran_int
call_@lapack_func@(GESDD_PARAMS_t *params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->JOBZ, &params->M, &params->N,
params->A, &params->LDA,
params->S,
params->U, &params->LDU,
params->VT, &params->LDVT,
params->WORK, &params->LWORK,
params->IWORK,
&rv);
return rv;
}
/**end repeat**/
/**begin repeat
#TYPE=CFLOAT,CDOUBLE#
#ftyp=fortran_complex,fortran_doublecomplex#
#frealtyp=fortran_real,fortran_doublereal#
#typ=COMPLEX_t,DOUBLECOMPLEX_t#
#lapack_func=cgesdd,zgesdd#
*/
static inline int
init_@lapack_func@(GESDD_PARAMS_t *params,
char jobz,
fortran_int m,
fortran_int n)
{
npy_uint8 *mem_buff = NULL, *mem_buff2 = NULL;
npy_uint8 *a,*s, *u, *vt, *work, *rwork, *iwork;
size_t a_size, s_size, u_size, vt_size, work_size, rwork_size, iwork_size;
fortran_int u_row_count, vt_column_count, work_count;
fortran_int min_m_n = m<n?m:n;
if (!compute_urows_vtcolumns(jobz, m, n, &u_row_count, &vt_column_count))
goto error;
a_size = ((size_t)m)*((size_t)n)*sizeof(@ftyp@);
s_size = ((size_t)min_m_n)*sizeof(@frealtyp@);
u_size = ((size_t)u_row_count)*m*sizeof(@ftyp@);
vt_size = n*((size_t)vt_column_count)*sizeof(@ftyp@);
rwork_size = 'N'==jobz?
7*((size_t)min_m_n) :
(5*(size_t)min_m_n*(size_t)min_m_n + 5*(size_t)min_m_n);
rwork_size *= sizeof(@ftyp@);
iwork_size = 8*((size_t)min_m_n)*sizeof(fortran_int);
mem_buff = malloc(a_size +
s_size +
u_size +
vt_size +
rwork_size +
iwork_size);
if (!mem_buff)
goto error;
a = mem_buff;
s = a + a_size;
u = s + s_size;
vt = u + u_size;
rwork = vt + vt_size;
iwork = rwork + rwork_size;
/* fix vt_column_count so that it is a valid lapack parameter (0 is not) */
vt_column_count = vt_column_count < 1? 1 : vt_column_count;
{
/* compute optimal work size */
@ftyp@ work_size_query;
fortran_int do_query = -1;
fortran_int rv;
LAPACK(@lapack_func@)(&jobz, &m, &n,
(void*)a, &m, (void*)s, (void*)u, &m,
(void*)vt, &vt_column_count,
&work_size_query, &do_query,
(void*)rwork,
(void*)iwork, &rv);
if (0!=rv)
goto error;
work_count = (fortran_int)((@typ@*)&work_size_query)->array[0];
work_size = (size_t)work_count * sizeof(@ftyp@);
}
mem_buff2 = malloc(work_size);
if (!mem_buff2)
goto error;
work = mem_buff2;
params->A = a;
params->S = s;
params->U = u;
params->VT = vt;
params->WORK = work;
params->RWORK = rwork;
params->IWORK = iwork;
params->M = m;
params->N = n;
params->LDA = m;
params->LDU = m;
params->LDVT = vt_column_count;
params->LWORK = work_count;
params->JOBZ = jobz;
return 1;
error:
TRACE_TXT("%s failed init\n", __FUNCTION__);
free(mem_buff2);
free(mem_buff);
memset(params, 0, sizeof(*params));
return 0;
}
static inline fortran_int
call_@lapack_func@(GESDD_PARAMS_t *params)
{
fortran_int rv;
LAPACK(@lapack_func@)(&params->JOBZ, &params->M, &params->N,
params->A, &params->LDA,
params->S,
params->U, &params->LDU,
params->VT, &params->LDVT,
params->WORK, &params->LWORK,
params->RWORK,
params->IWORK,
&rv);
return rv;
}
/**end repeat**/
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
#REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
#lapack_func=sgesdd,dgesdd,cgesdd,zgesdd#
*/
static inline void
release_@lapack_func@(GESDD_PARAMS_t* params)
{
/* A and WORK contain allocated blocks */
free(params->A);
free(params->WORK);
memset(params, 0, sizeof(*params));
}
static inline void
@TYPE@_svd_wrapper(char JOBZ,
char **args,
npy_intp* dimensions,
npy_intp* steps)
{
ptrdiff_t outer_steps[4];
int error_occurred = get_fp_invalid_and_clear();
size_t iter;
size_t outer_dim = *dimensions++;
size_t op_count = (JOBZ=='N')?2:4;
GESDD_PARAMS_t params;
for (iter=0; iter < op_count; ++iter) {
outer_steps[iter] = (ptrdiff_t) steps[iter];
}
steps += op_count;
if (init_@lapack_func@(&params,
JOBZ,
(fortran_int)dimensions[0],
(fortran_int)dimensions[1])) {
LINEARIZE_DATA_t a_in, u_out, s_out, v_out;
init_linearize_data(&a_in, params.N, params.M, steps[1], steps[0]);
if ('N' == params.JOBZ) {
/* only the singular values are wanted */
fortran_int min_m_n = params.M < params.N? params.M : params.N;
init_linearize_data(&s_out, 1, min_m_n, 0, steps[2]);
} else {
fortran_int u_columns, v_rows;
fortran_int min_m_n = params.M < params.N? params.M : params.N;
if ('S' == params.JOBZ) {
u_columns = min_m_n;
v_rows = min_m_n;
} else {
u_columns = params.M;
v_rows = params.N;
}
init_linearize_data(&u_out,
u_columns, params.M,
steps[3], steps[2]);
init_linearize_data(&s_out,
1, min_m_n,
0, steps[4]);
init_linearize_data(&v_out,
params.N, v_rows,
steps[6], steps[5]);
}
for (iter = 0; iter < outer_dim; ++iter) {
int not_ok;
/* copy the matrix in */
linearize_@TYPE@_matrix(params.A, args[0], &a_in);
not_ok = call_@lapack_func@(&params);
if (!not_ok) {
if ('N' == params.JOBZ) {
delinearize_@REALTYPE@_matrix(args[1], params.S, &s_out);
} else {
delinearize_@TYPE@_matrix(args[1], params.U, &u_out);
delinearize_@REALTYPE@_matrix(args[2], params.S, &s_out);
delinearize_@TYPE@_matrix(args[3], params.VT, &v_out);
}
} else {
error_occurred = 1;
if ('N' == params.JOBZ) {
nan_@REALTYPE@_matrix(args[1], &s_out);
} else {
nan_@TYPE@_matrix(args[1], &u_out);
nan_@REALTYPE@_matrix(args[2], &s_out);
nan_@TYPE@_matrix(args[3], &v_out);
}
}
update_pointers((npy_uint8**)args, outer_steps, op_count);
}
release_@lapack_func@(&params);
}
set_fp_invalid_or_clear(error_occurred);
}
/**end repeat*/
/* svd gufunc entry points */
/**begin repeat
#TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
*/
static void
@TYPE@_svd_N(char **args,
npy_intp *dimensions,
npy_intp *steps,
void *NPY_UNUSED(func))
{
@TYPE@_svd_wrapper('N', args, dimensions, steps);
}
static void
@TYPE@_svd_S(char **args,
npy_intp *dimensions,
npy_intp *steps,
void *NPY_UNUSED(func))
{
@TYPE@_svd_wrapper('S', args, dimensions, steps);
}
static void
@TYPE@_svd_A(char **args,
npy_intp *dimensions,
npy_intp *steps,
void *NPY_UNUSED(func))
{
@TYPE@_svd_wrapper('A', args, dimensions, steps);
}
/**end repeat**/
#pragma GCC diagnostic pop
/* -------------------------------------------------------------------------- */
/* gufunc registration */
static void *array_of_nulls[] = {
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL,
(void *)NULL
};
#define FUNC_ARRAY_NAME(NAME) NAME ## _funcs
#define GUFUNC_FUNC_ARRAY_REAL(NAME) \
static PyUFuncGenericFunction \
FUNC_ARRAY_NAME(NAME)[] = { \
FLOAT_ ## NAME, \
DOUBLE_ ## NAME \
}
#define GUFUNC_FUNC_ARRAY_REAL_COMPLEX(NAME) \
static PyUFuncGenericFunction \
FUNC_ARRAY_NAME(NAME)[] = { \
FLOAT_ ## NAME, \
DOUBLE_ ## NAME, \
CFLOAT_ ## NAME, \
CDOUBLE_ ## NAME \
}
/* There are problems with eig in complex single precision.
* That kernel is disabled
*/
#define GUFUNC_FUNC_ARRAY_EIG(NAME) \
static PyUFuncGenericFunction \
FUNC_ARRAY_NAME(NAME)[] = { \
FLOAT_ ## NAME, \
DOUBLE_ ## NAME, \
CDOUBLE_ ## NAME \
}
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighlo);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighup);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshlo);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshup);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve1);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(inv);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
GUFUNC_FUNC_ARRAY_EIG(eig);
GUFUNC_FUNC_ARRAY_EIG(eigvals);
static char equal_2_types[] = {
NPY_FLOAT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE,
NPY_CFLOAT, NPY_CFLOAT,
NPY_CDOUBLE, NPY_CDOUBLE
};
static char equal_3_types[] = {
NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT,
NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE
};
/* second result is logdet, that will always be a REAL */
static char slogdet_types[] = {
NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT,
NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE
};
static char eigh_types[] = {
NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT,
NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
};
static char eighvals_types[] = {
NPY_FLOAT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE,
NPY_CFLOAT, NPY_FLOAT,
NPY_CDOUBLE, NPY_DOUBLE
};
static char eig_types[] = {
NPY_FLOAT, NPY_CFLOAT, NPY_CFLOAT,
NPY_DOUBLE, NPY_CDOUBLE, NPY_CDOUBLE,
NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE
};
static char eigvals_types[] = {
NPY_FLOAT, NPY_CFLOAT,
NPY_DOUBLE, NPY_CDOUBLE,
NPY_CDOUBLE, NPY_CDOUBLE
};
static char svd_1_1_types[] = {
NPY_FLOAT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE,
NPY_CFLOAT, NPY_FLOAT,
NPY_CDOUBLE, NPY_DOUBLE
};
static char svd_1_3_types[] = {
NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,
NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE,
NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT,
NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
};
typedef struct gufunc_descriptor_struct {
char *name;
char *signature;
char *doc;
int ntypes;
int nin;
int nout;
PyUFuncGenericFunction *funcs;
char *types;
} GUFUNC_DESCRIPTOR_t;
GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
{
"slogdet",
"(m,m)->(),()",
"slogdet on the last two dimensions and broadcast on the rest. \n"\
"Results in two arrays, one with sign and the other with log of the"\
" determinants. \n"\
" \"(m,m)->(),()\" \n",
4, 1, 2,
FUNC_ARRAY_NAME(slogdet),
slogdet_types
},
{
"det",
"(m,m)->()",
"det of the last two dimensions and broadcast on the rest. \n"\
" \"(m,m)->()\" \n",
4, 1, 1,
FUNC_ARRAY_NAME(det),
equal_2_types
},
{
"eigh_lo",
"(m,m)->(m),(m,m)",
"eigh on the last two dimension and broadcast to the rest, using"\
" lower triangle \n"\
"Results in a vector of eigenvalues and a matrix with the"\
"eigenvectors. \n"\
" \"(m,m)->(m),(m,m)\" \n",
4, 1, 2,
FUNC_ARRAY_NAME(eighlo),
eigh_types
},
{
"eigh_up",
"(m,m)->(m),(m,m)",
"eigh on the last two dimension and broadcast to the rest, using"\
" upper triangle. \n"\
"Results in a vector of eigenvalues and a matrix with the"\
" eigenvectors. \n"\
" \"(m,m)->(m),(m,m)\" \n",
4, 1, 2,
FUNC_ARRAY_NAME(eighup),
eigh_types
},
{
"eigvalsh_lo",
"(m,m)->(m)",
"eigh on the last two dimension and broadcast to the rest, using"\
" lower triangle. \n"\
"Results in a vector of eigenvalues and a matrix with the"\
"eigenvectors. \n"\
" \"(m,m)->(m)\" \n",
4, 1, 1,
FUNC_ARRAY_NAME(eigvalshlo),
eighvals_types
},
{
"eigvalsh_up",
"(m,m)->(m)",
"eigvalsh on the last two dimension and broadcast to the rest,"\
" using upper triangle. \n"\
"Results in a vector of eigenvalues and a matrix with the"\
"eigenvectors.\n"\
" \"(m,m)->(m)\" \n",
4, 1, 1,
FUNC_ARRAY_NAME(eigvalshup),
eighvals_types
},
{
"solve",
"(m,m),(m,n)->(m,n)",
"solve the system a x = b, on the last two dimensions, broadcast"\
" to the rest. \n"\
"Results in a matrices with the solutions. \n"\
" \"(m,m),(m,n)->(m,n)\" \n",
4, 2, 1,
FUNC_ARRAY_NAME(solve),
equal_3_types
},
{
"solve1",
"(m,m),(m)->(m)",
"solve the system a x = b, for b being a vector, broadcast in"\
" the outer dimensions. \n"\
"Results in vectors with the solutions. \n"\
" \"(m,m),(m)->(m)\" \n",
4,2,1,
FUNC_ARRAY_NAME(solve1),
equal_3_types
},
{
"inv",
"(m,m)->(m,m)",
"compute the inverse of the last two dimensions and broadcast"\
" to the rest. \n"\
"Results in the inverse matrices. \n"\
" \"(m,m)->(m,m)\" \n",
4,1,1,
FUNC_ARRAY_NAME(inv),
equal_2_types
},
{
"cholesky_lo",
"(m,m)->(m,m)",
"cholesky decomposition of hermitian positive-definite matrices. \n"\
"Broadcast to all outer dimensions. \n"\
" \"(m,m)->(m,m)\" \n",
4, 1, 1,
FUNC_ARRAY_NAME(cholesky_lo),
equal_2_types
},
{
"svd_m",
"(m,n)->(m)",
"svd when n>=m. ",
4, 1, 1,
FUNC_ARRAY_NAME(svd_N),
svd_1_1_types
},
{
"svd_n",
"(m,n)->(n)",
"svd when n<=m",
4, 1, 1,
FUNC_ARRAY_NAME(svd_N),
svd_1_1_types
},
{
"svd_m_s",
"(m,n)->(m,m),(m),(m,n)",
"svd when m>=n",
4, 1, 3,
FUNC_ARRAY_NAME(svd_S),
svd_1_3_types
},
{
"svd_n_s",
"(m,n)->(m,n),(n),(n,n)",
"svd when m>=n",
4, 1, 3,
FUNC_ARRAY_NAME(svd_S),
svd_1_3_types
},
{
"svd_m_f",
"(m,n)->(m,m),(m),(n,n)",
"svd when m>=n",
4, 1, 3,
FUNC_ARRAY_NAME(svd_A),
svd_1_3_types
},
{
"svd_n_f",
"(m,n)->(m,m),(n),(n,n)",
"svd when m>=n",
4, 1, 3,
FUNC_ARRAY_NAME(svd_A),
svd_1_3_types
},
{
"eig",
"(m,m)->(m),(m,m)",
"eig on the last two dimension and broadcast to the rest. \n"\
"Results in a vector with the eigenvalues and a matrix with the"\
" eigenvectors. \n"\
" \"(m,m)->(m),(m,m)\" \n",
3, 1, 2,
FUNC_ARRAY_NAME(eig),
eig_types
},
{
"eigvals",
"(m,m)->(m)",
"eigvals on the last two dimension and broadcast to the rest. \n"\
"Results in a vector of eigenvalues. \n"\
" \"(m,m)->(m),(m,m)\" \n",
3, 1, 1,
FUNC_ARRAY_NAME(eigvals),
eigvals_types
},
};
static void
addUfuncs(PyObject *dictionary) {
PyObject *f;
int i;
const int gufunc_count = sizeof(gufunc_descriptors)/
sizeof(gufunc_descriptors[0]);
for (i=0; i < gufunc_count; i++) {
GUFUNC_DESCRIPTOR_t* d = &gufunc_descriptors[i];
f = PyUFunc_FromFuncAndDataAndSignature(d->funcs,
array_of_nulls,
d->types,
d->ntypes,
d->nin,
d->nout,
PyUFunc_None,
d->name,
d->doc,
0,
d->signature);
PyDict_SetItemString(dictionary, d->name, f);
#if 0
dump_ufunc_object((PyUFuncObject*) f);
#endif
Py_DECREF(f);
}
}
/* -------------------------------------------------------------------------- */
/* Module initialization stuff */
static PyMethodDef UMath_LinAlgMethods[] = {
{NULL, NULL, 0, NULL} /* Sentinel */
};
#if defined(NPY_PY3K)
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
UMATH_LINALG_MODULE_NAME,
NULL,
-1,
UMath_LinAlgMethods,
NULL,
NULL,
NULL,
NULL
};
#endif
#if defined(NPY_PY3K)
#define RETVAL m
PyObject *PyInit__umath_linalg(void)
#else
#define RETVAL
PyMODINIT_FUNC
init_umath_linalg(void)
#endif
{
PyObject *m;
PyObject *d;
PyObject *version;
init_constants();
#if defined(NPY_PY3K)
m = PyModule_Create(&moduledef);
#else
m = Py_InitModule(UMATH_LINALG_MODULE_NAME, UMath_LinAlgMethods);
#endif
if (m == NULL)
return RETVAL;
import_array();
import_ufunc();
d = PyModule_GetDict(m);
version = PyString_FromString(umath_linalg_version_string);
PyDict_SetItemString(d, "__version__", version);
Py_DECREF(version);
/* Load the ufunc operators into the module's namespace */
addUfuncs(d);
if (PyErr_Occurred()) {
PyErr_SetString(PyExc_RuntimeError,
"cannot load _umath_linalg module.");
}
return RETVAL;
}