/* -*- c -*- */ /* ***************************************************************************** ** INCLUDES ** ***************************************************************************** */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "Python.h" #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" #include "npy_pycompat.h" #include "npy_config.h" #include #include #include #include static const char* umath_linalg_version_string = "0.1.4"; /* **************************************************************************** * Debugging support * **************************************************************************** */ #define TRACE_TXT(...) do { fprintf (stderr, __VA_ARGS__); } while (0) #define STACK_TRACE do {} while (0) #define TRACE\ do { \ fprintf (stderr, \ "%s:%d:%s\n", \ __FILE__, \ __LINE__, \ __FUNCTION__); \ STACK_TRACE; \ } while (0) #if 0 #include void dbg_stack_trace() { void *trace[32]; size_t size; size = backtrace(trace, sizeof(trace)/sizeof(trace[0])); backtrace_symbols_fd(trace, size, 1); } #undef STACK_TRACE #define STACK_TRACE do { dbg_stack_trace(); } while (0) #endif /* ***************************************************************************** * BLAS/LAPACK calling macros * ***************************************************************************** */ #ifdef NO_APPEND_FORTRAN # define FNAME(x) x #else # define FNAME(x) x##_ #endif typedef struct { float r, i; } f2c_complex; typedef struct { double r, i; } f2c_doublecomplex; /* typedef long int (*L_fp)(); */ extern int FNAME(sgeev)(char *jobvl, char *jobvr, int *n, float a[], int *lda, float wr[], float wi[], float vl[], int *ldvl, float vr[], int *ldvr, float work[], int lwork[], int *info); extern int FNAME(dgeev)(char *jobvl, char *jobvr, int *n, double a[], int *lda, double wr[], double wi[], double vl[], int *ldvl, double vr[], int *ldvr, double work[], int lwork[], int *info); extern int FNAME(cgeev)(char *jobvl, char *jobvr, int *n, f2c_doublecomplex a[], int *lda, f2c_doublecomplex w[], f2c_doublecomplex vl[], int *ldvl, f2c_doublecomplex vr[], int *ldvr, f2c_doublecomplex work[], int *lwork, double rwork[], int *info); extern int FNAME(zgeev)(char *jobvl, char *jobvr, int *n, f2c_doublecomplex a[], int *lda, f2c_doublecomplex w[], f2c_doublecomplex vl[], int *ldvl, f2c_doublecomplex vr[], int *ldvr, f2c_doublecomplex work[], int *lwork, double rwork[], int *info); extern int FNAME(ssyevd)(char *jobz, char *uplo, int *n, float a[], int *lda, float w[], float work[], int *lwork, int iwork[], int *liwork, int *info); extern int FNAME(dsyevd)(char *jobz, char *uplo, int *n, double a[], int *lda, double w[], double work[], int *lwork, int iwork[], int *liwork, int *info); extern int FNAME(cheevd)(char *jobz, char *uplo, int *n, f2c_complex a[], int *lda, float w[], f2c_complex work[], int *lwork, float rwork[], int *lrwork, int iwork[], int *liwork, int *info); extern int FNAME(zheevd)(char *jobz, char *uplo, int *n, f2c_doublecomplex a[], int *lda, double w[], f2c_doublecomplex work[], int *lwork, double rwork[], int *lrwork, int iwork[], int *liwork, int *info); extern int FNAME(dgelsd)(int *m, int *n, int *nrhs, double a[], int *lda, double b[], int *ldb, double s[], double *rcond, int *rank, double work[], int *lwork, int iwork[], int *info); extern int FNAME(zgelsd)(int *m, int *n, int *nrhs, f2c_doublecomplex a[], int *lda, f2c_doublecomplex b[], int *ldb, double s[], double *rcond, int *rank, f2c_doublecomplex work[], int *lwork, double rwork[], int iwork[], int *info); extern int FNAME(sgesv)(int *n, int *nrhs, float a[], int *lda, int ipiv[], float b[], int *ldb, int *info); extern int FNAME(dgesv)(int *n, int *nrhs, double a[], int *lda, int ipiv[], double b[], int *ldb, int *info); extern int FNAME(cgesv)(int *n, int *nrhs, f2c_complex a[], int *lda, int ipiv[], f2c_complex b[], int *ldb, int *info); extern int FNAME(zgesv)(int *n, int *nrhs, f2c_doublecomplex a[], int *lda, int ipiv[], f2c_doublecomplex b[], int *ldb, int *info); extern int FNAME(sgetrf)(int *m, int *n, float a[], int *lda, int ipiv[], int *info); extern int FNAME(dgetrf)(int *m, int *n, double a[], int *lda, int ipiv[], int *info); extern int FNAME(cgetrf)(int *m, int *n, f2c_complex a[], int *lda, int ipiv[], int *info); extern int FNAME(zgetrf)(int *m, int *n, f2c_doublecomplex a[], int *lda, int ipiv[], int *info); extern int FNAME(spotrf)(char *uplo, int *n, float a[], int *lda, int *info); extern int FNAME(dpotrf)(char *uplo, int *n, double a[], int *lda, int *info); extern int FNAME(cpotrf)(char *uplo, int *n, f2c_complex a[], int *lda, int *info); extern int FNAME(zpotrf)(char *uplo, int *n, f2c_doublecomplex a[], int *lda, int *info); extern int FNAME(sgesdd)(char *jobz, int *m, int *n, float a[], int *lda, float s[], float u[], int *ldu, float vt[], int *ldvt, float work[], int *lwork, int iwork[], int *info); extern int FNAME(dgesdd)(char *jobz, int *m, int *n, double a[], int *lda, double s[], double u[], int *ldu, double vt[], int *ldvt, double work[], int *lwork, int iwork[], int *info); extern int FNAME(cgesdd)(char *jobz, int *m, int *n, f2c_complex a[], int *lda, float s[], f2c_complex u[], int *ldu, f2c_complex vt[], int *ldvt, f2c_complex work[], int *lwork, float rwork[], int iwork[], int *info); extern int FNAME(zgesdd)(char *jobz, int *m, int *n, f2c_doublecomplex a[], int *lda, double s[], f2c_doublecomplex u[], int *ldu, f2c_doublecomplex vt[], int *ldvt, f2c_doublecomplex work[], int *lwork, double rwork[], int iwork[], int *info); extern int FNAME(spotrs)(char *uplo, int *n, int *nrhs, float a[], int *lda, float b[], int *ldb, int *info); extern int FNAME(dpotrs)(char *uplo, int *n, int *nrhs, double a[], int *lda, double b[], int *ldb, int *info); extern int FNAME(cpotrs)(char *uplo, int *n, int *nrhs, f2c_complex a[], int *lda, f2c_complex b[], int *ldb, int *info); extern int FNAME(zpotrs)(char *uplo, int *n, int *nrhs, f2c_doublecomplex a[], int *lda, f2c_doublecomplex b[], int *ldb, int *info); extern int FNAME(spotri)(char *uplo, int *n, float a[], int *lda, int *info); extern int FNAME(dpotri)(char *uplo, int *n, double a[], int *lda, int *info); extern int FNAME(cpotri)(char *uplo, int *n, f2c_complex a[], int *lda, int *info); extern int FNAME(zpotri)(char *uplo, int *n, f2c_doublecomplex a[], int *lda, int *info); extern int FNAME(scopy)(int *n, float *sx, int *incx, float *sy, int *incy); extern int FNAME(dcopy)(int *n, double *sx, int *incx, double *sy, int *incy); extern int FNAME(ccopy)(int *n, f2c_complex *sx, int *incx, f2c_complex *sy, int *incy); extern int FNAME(zcopy)(int *n, f2c_doublecomplex *sx, int *incx, f2c_doublecomplex *sy, int *incy); extern float FNAME(sdot)(int *n, float *sx, int *incx, float *sy, int *incy); extern double FNAME(ddot)(int *n, double *sx, int *incx, double *sy, int *incy); extern f2c_complex FNAME(cdotu)(int *n, f2c_complex *sx, int *incx, f2c_complex *sy, int *incy); extern f2c_doublecomplex FNAME(zdotu)(int *n, f2c_doublecomplex *sx, int *incx, f2c_doublecomplex *sy, int *incy); extern f2c_complex FNAME(cdotc)(int *n, f2c_complex *sx, int *incx, f2c_complex *sy, int *incy); extern f2c_doublecomplex FNAME(zdotc)(int *n, f2c_doublecomplex *sx, int *incx, f2c_doublecomplex *sy, int *incy); extern int FNAME(sgemm)(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); extern int FNAME(dgemm)(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); extern int FNAME(cgemm)(char *transa, char *transb, int *m, int *n, int *k, f2c_complex *alpha, f2c_complex *a, int *lda, f2c_complex *b, int *ldb, f2c_complex *beta, f2c_complex *c, int *ldc); extern int FNAME(zgemm)(char *transa, char *transb, int *m, int *n, int *k, f2c_doublecomplex *alpha, f2c_doublecomplex *a, int *lda, f2c_doublecomplex *b, int *ldb, f2c_doublecomplex *beta, f2c_doublecomplex *c, int *ldc); #define LAPACK_T(FUNC) \ TRACE_TXT("Calling LAPACK ( " # FUNC " )\n"); \ FNAME(FUNC) #define BLAS(FUNC) \ FNAME(FUNC) #define LAPACK(FUNC) \ FNAME(FUNC) typedef int fortran_int; typedef float fortran_real; typedef double fortran_doublereal; typedef f2c_complex fortran_complex; typedef f2c_doublecomplex fortran_doublecomplex; /* ***************************************************************************** ** Some handy functions ** ***************************************************************************** */ static inline void * offset_ptr(void* ptr, ptrdiff_t offset) { return (void*)((npy_uint8*)ptr + offset); } static inline int get_fp_invalid_and_clear(void) { int status; status = npy_clear_floatstatus(); return !!(status & NPY_FPE_INVALID); } static inline void set_fp_invalid_or_clear(int error_occurred) { if (error_occurred) { npy_set_floatstatus_invalid(); } else { npy_clear_floatstatus(); } } /* ***************************************************************************** ** Some handy constants ** ***************************************************************************** */ #define UMATH_LINALG_MODULE_NAME "_umath_linalg" typedef union { fortran_complex f; npy_cfloat npy; float array[2]; } COMPLEX_t; typedef union { fortran_doublecomplex f; npy_cdouble npy; double array[2]; } DOUBLECOMPLEX_t; static float s_one; static float s_zero; static float s_minus_one; static float s_ninf; static float s_nan; static double d_one; static double d_zero; static double d_minus_one; static double d_ninf; static double d_nan; static COMPLEX_t c_one; static COMPLEX_t c_zero; static COMPLEX_t c_minus_one; static COMPLEX_t c_ninf; static COMPLEX_t c_nan; static DOUBLECOMPLEX_t z_one; static DOUBLECOMPLEX_t z_zero; static DOUBLECOMPLEX_t z_minus_one; static DOUBLECOMPLEX_t z_ninf; static DOUBLECOMPLEX_t z_nan; static void init_constants(void) { /* this is needed as NPY_INFINITY and NPY_NAN macros can't be used as initializers. I prefer to just set all the constants the same way. */ s_one = 1.0f; s_zero = 0.0f; s_minus_one = -1.0f; s_ninf = -NPY_INFINITYF; s_nan = NPY_NANF; d_one = 1.0; d_zero = 0.0; d_minus_one = -1.0; d_ninf = -NPY_INFINITY; d_nan = NPY_NAN; c_one.array[0] = 1.0f; c_one.array[1] = 0.0f; c_zero.array[0] = 0.0f; c_zero.array[1] = 0.0f; c_minus_one.array[0] = -1.0f; c_minus_one.array[1] = 0.0f; c_ninf.array[0] = -NPY_INFINITYF; c_ninf.array[1] = 0.0f; c_nan.array[0] = NPY_NANF; c_nan.array[1] = NPY_NANF; z_one.array[0] = 1.0; z_one.array[1] = 0.0; z_zero.array[0] = 0.0; z_zero.array[1] = 0.0; z_minus_one.array[0] = -1.0; z_minus_one.array[1] = 0.0; z_ninf.array[0] = -NPY_INFINITY; z_ninf.array[1] = 0.0; z_nan.array[0] = NPY_NAN; z_nan.array[1] = NPY_NAN; } /* ***************************************************************************** ** Structs used for data rearrangement ** ***************************************************************************** */ /* this struct contains information about how to linearize in a local buffer a matrix so that it can be used by blas functions. All strides are specified in number of elements (similar to what blas expects) dst_row_strides: number of elements between different row. Matrix is considered row-major dst_column_strides: number of elements between differnt columns in the destination buffer rows: number of rows of the matrix columns: number of columns of the matrix src_row_strides: strides needed to access the next row in the source matrix src_column_strides: strides needed to access the next column in the source matrix */ typedef struct linearize_data_struct { size_t rows; size_t columns; ptrdiff_t row_strides; ptrdiff_t column_strides; } LINEARIZE_DATA_t; static inline void init_linearize_data(LINEARIZE_DATA_t *lin_data, int rows, int columns, ptrdiff_t row_strides, ptrdiff_t column_strides) { lin_data->rows = rows; lin_data->columns = columns; lin_data->row_strides = row_strides; lin_data->column_strides = column_strides; } static inline void dump_ufunc_object(PyUFuncObject* ufunc) { TRACE_TXT("\n\n%s '%s' (%d input(s), %d output(s), %d specialization(s).\n", ufunc->core_enabled? "generalized ufunc" : "scalar ufunc", ufunc->name, ufunc->nin, ufunc->nout, ufunc->ntypes); if (ufunc->core_enabled) { int arg; int dim; TRACE_TXT("\t%s (%d dimension(s) detected).\n", ufunc->core_signature, ufunc->core_num_dim_ix); for (arg = 0; arg < ufunc->nargs; arg++){ int * arg_dim_ix = ufunc->core_dim_ixs + ufunc->core_offsets[arg]; TRACE_TXT("\t\targ %d (%s) has %d dimension(s): (", arg, arg < ufunc->nin? "INPUT" : "OUTPUT", ufunc->core_num_dims[arg]); for (dim = 0; dim < ufunc->core_num_dims[arg]; dim ++) { TRACE_TXT(" %d", arg_dim_ix[dim]); } TRACE_TXT(" )\n"); } } } static inline void dump_linearize_data(const char* name, const LINEARIZE_DATA_t* params) { TRACE_TXT("\n\t%s rows: %zd columns: %zd"\ "\n\t\trow_strides: %td column_strides: %td"\ "\n", name, params->rows, params->columns, params->row_strides, params->column_strides); } static inline float FLOAT_add(float op1, float op2) { return op1 + op2; } static inline double DOUBLE_add(double op1, double op2) { return op1 + op2; } static inline COMPLEX_t CFLOAT_add(COMPLEX_t op1, COMPLEX_t op2) { COMPLEX_t result; result.array[0] = op1.array[0] + op2.array[0]; result.array[1] = op1.array[1] + op2.array[1]; return result; } static inline DOUBLECOMPLEX_t CDOUBLE_add(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2) { DOUBLECOMPLEX_t result; result.array[0] = op1.array[0] + op2.array[0]; result.array[1] = op1.array[1] + op2.array[1]; return result; } static inline float FLOAT_mul(float op1, float op2) { return op1*op2; } static inline double DOUBLE_mul(double op1, double op2) { return op1*op2; } static inline COMPLEX_t CFLOAT_mul(COMPLEX_t op1, COMPLEX_t op2) { COMPLEX_t result; result.array[0] = op1.array[0]*op2.array[0] - op1.array[1]*op2.array[1]; result.array[1] = op1.array[1]*op2.array[0] + op1.array[0]*op2.array[1]; return result; } static inline DOUBLECOMPLEX_t CDOUBLE_mul(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2) { DOUBLECOMPLEX_t result; result.array[0] = op1.array[0]*op2.array[0] - op1.array[1]*op2.array[1]; result.array[1] = op1.array[1]*op2.array[0] + op1.array[0]*op2.array[1]; return result; } static inline float FLOAT_mulc(float op1, float op2) { return op1*op2; } static inline double DOUBLE_mulc(float op1, float op2) { return op1*op2; } static inline COMPLEX_t CFLOAT_mulc(COMPLEX_t op1, COMPLEX_t op2) { COMPLEX_t result; result.array[0] = op1.array[0]*op2.array[0] + op1.array[1]*op2.array[1]; result.array[1] = op1.array[0]*op2.array[1] - op1.array[1]*op2.array[0]; return result; } static inline DOUBLECOMPLEX_t CDOUBLE_mulc(DOUBLECOMPLEX_t op1, DOUBLECOMPLEX_t op2) { DOUBLECOMPLEX_t result; result.array[0] = op1.array[0]*op2.array[0] + op1.array[1]*op2.array[1]; result.array[1] = op1.array[0]*op2.array[1] - op1.array[1]*op2.array[0]; return result; } static inline void print_FLOAT(npy_float s) { TRACE_TXT(" %8.4f", s); } static inline void print_DOUBLE(npy_double d) { TRACE_TXT(" %10.6f", d); } static inline void print_CFLOAT(npy_cfloat c) { float* c_parts = (float*)&c; TRACE_TXT("(%8.4f, %8.4fj)", c_parts[0], c_parts[1]); } static inline void print_CDOUBLE(npy_cdouble z) { double* z_parts = (double*)&z; TRACE_TXT("(%8.4f, %8.4fj)", z_parts[0], z_parts[1]); } /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# #typ=npy_float,npy_double,npy_cfloat,npy_cdouble# */ static inline void dump_@TYPE@_matrix(const char* name, size_t rows, size_t columns, const @typ@* ptr) { size_t i,j; TRACE_TXT("\n%s %p (%zd, %zd)\n", name, ptr, rows, columns); for (i=0; icolumns; fortran_int column_strides = (fortran_int)(data->column_strides/sizeof(@typ@)); fortran_int one = 1; for (i=0; i< data->rows; i++) { if (column_strides > 0) { FNAME(@copy@)(&columns, (void*)src, &column_strides, (void*)dst, &one); } else if (column_strides < 0) { FNAME(@copy@)(&columns, (void*)((@typ@*)src + (columns-1)*column_strides), &column_strides, (void*)dst, &one); } else { /* * Zero stride has undefined behavior in some BLAS * implementations (e.g. OSX Accelerate), so do it * manually */ for (j = 0; j < columns; ++j) { memcpy((@typ@*)dst + j, (@typ@*)src, sizeof(@typ@)); } } src += data->row_strides/sizeof(@typ@); dst += data->columns; } return rv; } else { return src; } } static inline void * delinearize_@TYPE@_matrix(void *dst_in, void *src_in, const LINEARIZE_DATA_t* data) { @typ@ *src = (@typ@ *) src_in; @typ@ *dst = (@typ@ *) dst_in; if (src) { int i; @typ@ *rv = src; fortran_int columns = (fortran_int)data->columns; fortran_int column_strides = (fortran_int)(data->column_strides/sizeof(@typ@)); fortran_int one = 1; for (i=0; i < data->rows; i++) { if (column_strides > 0) { FNAME(@copy@)(&columns, (void*)src, &one, (void*)dst, &column_strides); } else if (column_strides < 0) { FNAME(@copy@)(&columns, (void*)src, &one, (void*)((@typ@*)dst + (columns-1)*column_strides), &column_strides); } else { /* * Zero stride has undefined behavior in some BLAS * implementations (e.g. OSX Accelerate), so do it * manually */ if (columns > 0) { memcpy((@typ@*)dst, (@typ@*)src + (columns-1), sizeof(@typ@)); } } src += data->columns; dst += data->row_strides/sizeof(@typ@); } return rv; } else { return src; } } static inline void nan_@TYPE@_matrix(void *dst_in, const LINEARIZE_DATA_t* data) { @typ@ *dst = (@typ@ *) dst_in; int i,j; for (i=0; i < data->rows; i++) { @typ@ *cp = dst; ptrdiff_t cs = data->column_strides/sizeof(@typ@); for (j=0; j< data->columns; ++j) { *cp = @nan@; cp += cs; } dst += data->row_strides/sizeof(@typ@); } } /**end repeat**/ /* identity square matrix generation */ /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# #typ=float,double,COMPLEX_t,DOUBLECOMPLEX_t# #cblas_type=s,d,c,z# */ static inline void identity_@TYPE@_matrix(void *ptr, size_t n) { size_t i; @typ@ *matrix = (@typ@*) ptr; /* in IEEE floating point, zeroes are represented as bitwise 0 */ memset(matrix, 0, n*n*sizeof(@typ@)); for (i = 0; i < n; ++i) { *matrix = @cblas_type@_one; matrix += n+1; } } /**end repeat**/ /* lower/upper triangular matrix using blas (in place) */ /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# #typ=float,double,COMPLEX_t,DOUBLECOMPLEX_t# #cblas_type=s,d,c,z# */ static inline void triu_@TYPE@_matrix(void *ptr, size_t n) { size_t i,j; @typ@ *matrix = (@typ@*)ptr; matrix += n; for (i=1; i < n; ++i) { for (j=0; jA = a; params->W = w; params->WORK = work; params->RWORK = NULL; /* unused */ params->IWORK = iwork; params->N = N; params->LWORK = lwork; params->LRWORK = 0; /* unused */ params->LIWORK = liwork; params->JOBZ = JOBZ; params->UPLO = UPLO; return 1; error: /* something failed */ memset(params, 0, sizeof(*params)); free(mem_buff2); free(mem_buff); return 0; } static inline fortran_int call_@lapack_func@(EIGH_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, params->A, ¶ms->N, params->W, params->WORK, ¶ms->LWORK, params->IWORK, ¶ms->LIWORK, &rv); return rv; } /**end repeat**/ /**begin repeat #TYPE=CFLOAT,CDOUBLE# #typ=npy_cfloat,npy_cdouble# #basetyp=npy_float,npy_double# #ftyp=fortran_complex,fortran_doublecomplex# #fbasetyp=fortran_real,fortran_doublereal# #lapack_func=cheevd,zheevd# */ /* * Initialize the parameters to use in for the lapack function _heev * Handles buffer allocation */ static inline int init_@lapack_func@(EIGH_PARAMS_t *params, char JOBZ, char UPLO, fortran_int N) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; @ftyp@ query_work_size; @fbasetyp@ query_rwork_size; fortran_int query_iwork_size; fortran_int lwork = -1; fortran_int lrwork = -1; fortran_int liwork = -1; npy_uint8 *a, *w, *work, *rwork, *iwork; fortran_int info; mem_buff = malloc(N*N*sizeof(@typ@)+N*sizeof(@basetyp@)); if (!mem_buff) goto error; a = mem_buff; w = mem_buff+N*N*sizeof(@typ@); LAPACK(@lapack_func@)(&JOBZ, &UPLO, &N, (@ftyp@*)a, &N, (@fbasetyp@*)w, &query_work_size, &lwork, &query_rwork_size, &lrwork, &query_iwork_size, &liwork, &info); if (info != 0) goto error; lwork = (fortran_int)*(@fbasetyp@*)&query_work_size; lrwork = (fortran_int)query_rwork_size; liwork = query_iwork_size; mem_buff2 = malloc(lwork*sizeof(@typ@) + lrwork*sizeof(@basetyp@) + liwork*sizeof(fortran_int)); if (!mem_buff2) goto error; work = mem_buff2; rwork = work + lwork*sizeof(@typ@); iwork = rwork + lrwork*sizeof(@basetyp@); params->A = a; params->W = w; params->WORK = work; params->RWORK = rwork; params->IWORK = iwork; params->N = N; params->LWORK = lwork; params->LRWORK = lrwork; params->LIWORK = liwork; params->JOBZ = JOBZ; params->UPLO = UPLO; return 1; /* something failed */ error: memset(params, 0, sizeof(*params)); free(mem_buff2); free(mem_buff); return 0; } static inline fortran_int call_@lapack_func@(EIGH_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->UPLO, ¶ms->N, params->A, ¶ms->N, params->W, params->WORK, ¶ms->LWORK, params->RWORK, ¶ms->LRWORK, params->IWORK, ¶ms->LIWORK, &rv); return rv; } /**end repeat**/ /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# #BASETYPE=FLOAT,DOUBLE,FLOAT,DOUBLE# #typ=npy_float,npy_double,npy_cfloat,npy_cdouble# #basetyp=npy_float,npy_double,npy_float,npy_double# #lapack_func=ssyevd,dsyevd,cheevd,zheevd# **/ /* * (M,M)->(M,)(M,M) * dimensions[1] -> M * args[0] -> A[in] * args[1] -> W * args[2] -> A[out] */ static inline void release_@lapack_func@(EIGH_PARAMS_t *params) { /* allocated memory in A and WORK */ free(params->A); free(params->WORK); memset(params, 0, sizeof(*params)); } static inline void @TYPE@_eigh_wrapper(char JOBZ, char UPLO, char**args, npy_intp* dimensions, npy_intp* steps) { ptrdiff_t outer_steps[3]; size_t iter; size_t outer_dim = *dimensions++; size_t op_count = (JOBZ=='N')?2:3; EIGH_PARAMS_t eigh_params; int error_occurred = get_fp_invalid_and_clear(); for (iter=0; iter < op_count; ++iter) { outer_steps[iter] = (ptrdiff_t) steps[iter]; } steps += op_count; if (init_@lapack_func@(&eigh_params, JOBZ, UPLO, (fortran_int)dimensions[0])) { LINEARIZE_DATA_t matrix_in_ld; LINEARIZE_DATA_t eigenvectors_out_ld; LINEARIZE_DATA_t eigenvalues_out_ld; init_linearize_data(&matrix_in_ld, eigh_params.N, eigh_params.N, steps[1], steps[0]); init_linearize_data(&eigenvalues_out_ld, 1, eigh_params.N, 0, steps[2]); if ('V' == eigh_params.JOBZ) { init_linearize_data(&eigenvectors_out_ld, eigh_params.N, eigh_params.N, steps[4], steps[3]); } for (iter = 0; iter < outer_dim; ++iter) { int not_ok; /* copy the matrix in */ linearize_@TYPE@_matrix(eigh_params.A, args[0], &matrix_in_ld); not_ok = call_@lapack_func@(&eigh_params); if (!not_ok) { /* lapack ok, copy result out */ delinearize_@BASETYPE@_matrix(args[1], eigh_params.W, &eigenvalues_out_ld); if ('V' == eigh_params.JOBZ) { delinearize_@TYPE@_matrix(args[2], eigh_params.A, &eigenvectors_out_ld); } } else { /* lapack fail, set result to nan */ error_occurred = 1; nan_@BASETYPE@_matrix(args[1], &eigenvalues_out_ld); if ('V' == eigh_params.JOBZ) { nan_@TYPE@_matrix(args[2], &eigenvectors_out_ld); } } update_pointers((npy_uint8**)args, outer_steps, op_count); } release_@lapack_func@(&eigh_params); } set_fp_invalid_or_clear(error_occurred); } /**end repeat**/ /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# */ static void @TYPE@_eighlo(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { @TYPE@_eigh_wrapper('V', 'L', args, dimensions, steps); } static void @TYPE@_eighup(char **args, npy_intp *dimensions, npy_intp *steps, void* NPY_UNUSED(func)) { @TYPE@_eigh_wrapper('V', 'U', args, dimensions, steps); } static void @TYPE@_eigvalshlo(char **args, npy_intp *dimensions, npy_intp *steps, void* NPY_UNUSED(func)) { @TYPE@_eigh_wrapper('N', 'L', args, dimensions, steps); } static void @TYPE@_eigvalshup(char **args, npy_intp *dimensions, npy_intp *steps, void* NPY_UNUSED(func)) { @TYPE@_eigh_wrapper('N', 'U', args, dimensions, steps); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* Solve family (includes inv) */ typedef struct gesv_params_struct { void *A; /* A is (N,N) of base type */ void *B; /* B is (N,NRHS) of base type */ fortran_int * IPIV; /* IPIV is (N) */ fortran_int N; fortran_int NRHS; fortran_int LDA; fortran_int LDB; } GESV_PARAMS_t; /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# #typ=npy_float,npy_double,npy_cfloat,npy_cdouble# #ftyp=fortran_real,fortran_doublereal,fortran_complex,fortran_doublecomplex# #lapack_func=sgesv,dgesv,cgesv,zgesv# */ /* * Initialize the parameters to use in for the lapack function _heev * Handles buffer allocation */ static inline int init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS) { npy_uint8 *mem_buff = NULL; npy_uint8 *a, *b, *ipiv; mem_buff = malloc(N*N*sizeof(@ftyp@) + N*NRHS*sizeof(@ftyp@) + N*sizeof(fortran_int)); if (!mem_buff) goto error; a = mem_buff; b = a + N*N*sizeof(@ftyp@); ipiv = b + N*NRHS*sizeof(@ftyp@); params->A = a; params->B = b; params->IPIV = (fortran_int*)ipiv; params->N = N; params->NRHS = NRHS; params->LDA = N; params->LDB = N; return 1; error: free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static inline void release_@lapack_func@(GESV_PARAMS_t *params) { /* memory block base is in A */ free(params->A); memset(params, 0, sizeof(*params)); } static inline fortran_int call_@lapack_func@(GESV_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->N, ¶ms->NRHS, params->A, ¶ms->LDA, params->IPIV, params->B, ¶ms->LDB, &rv); return rv; } static void @TYPE@_solve(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { GESV_PARAMS_t params; fortran_int n, nrhs; int error_occurred = get_fp_invalid_and_clear(); INIT_OUTER_LOOP_3 n = (fortran_int)dimensions[0]; nrhs = (fortran_int)dimensions[1]; if (init_@lapack_func@(¶ms, n, nrhs)) { LINEARIZE_DATA_t a_in, b_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&b_in, nrhs, n, steps[3], steps[2]); init_linearize_data(&r_out, nrhs, n, steps[5], steps[4]); BEGIN_OUTER_LOOP_3 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); linearize_@TYPE@_matrix(params.B, args[1], &b_in); not_ok =call_@lapack_func@(¶ms); if (!not_ok) { delinearize_@TYPE@_matrix(args[2], params.B, &r_out); } else { error_occurred = 1; nan_@TYPE@_matrix(args[2], &r_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } static void @TYPE@_solve1(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { GESV_PARAMS_t params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n; INIT_OUTER_LOOP_3 n = (fortran_int)dimensions[0]; if (init_@lapack_func@(¶ms, n, 1)) { LINEARIZE_DATA_t a_in, b_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&b_in, 1, n, 1, steps[2]); init_linearize_data(&r_out, 1, n, 1, steps[3]); BEGIN_OUTER_LOOP_3 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); linearize_@TYPE@_matrix(params.B, args[1], &b_in); not_ok = call_@lapack_func@(¶ms); if (!not_ok) { delinearize_@TYPE@_matrix(args[2], params.B, &r_out); } else { error_occurred = 1; nan_@TYPE@_matrix(args[2], &r_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } static void @TYPE@_inv(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { GESV_PARAMS_t params; fortran_int n; int error_occurred = get_fp_invalid_and_clear(); INIT_OUTER_LOOP_2 n = (fortran_int)dimensions[0]; if (init_@lapack_func@(¶ms, n, n)) { LINEARIZE_DATA_t a_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&r_out, n, n, steps[3], steps[2]); BEGIN_OUTER_LOOP_2 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); identity_@TYPE@_matrix(params.B, n); not_ok = call_@lapack_func@(¶ms); if (!not_ok) { delinearize_@TYPE@_matrix(args[1], params.B, &r_out); } else { error_occurred = 1; nan_@TYPE@_matrix(args[1], &r_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* Cholesky decomposition */ typedef struct potr_params_struct { void *A; fortran_int N; fortran_int LDA; char UPLO; } POTR_PARAMS_t; /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# #ftyp=fortran_real, fortran_doublereal, fortran_complex, fortran_doublecomplex# #lapack_func=spotrf,dpotrf,cpotrf,zpotrf# */ static inline int init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N) { npy_uint8 *mem_buff = NULL; npy_uint8 *a; mem_buff = malloc(N*N*sizeof(@ftyp@)); if (!mem_buff) goto error; a = mem_buff; params->A = a; params->N = N; params->LDA = N; params->UPLO = UPLO; return 1; error: free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static inline void release_@lapack_func@(POTR_PARAMS_t *params) { /* memory block base in A */ free(params->A); memset(params, 0, sizeof(*params)); } static inline fortran_int call_@lapack_func@(POTR_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->UPLO, ¶ms->N, params->A, ¶ms->LDA, &rv); return rv; } static void @TYPE@_cholesky(char uplo, char **args, npy_intp *dimensions, npy_intp *steps) { POTR_PARAMS_t params; int error_occurred = get_fp_invalid_and_clear(); fortran_int n; INIT_OUTER_LOOP_2 assert(uplo == 'L'); n = (fortran_int)dimensions[0]; if (init_@lapack_func@(¶ms, uplo, n)) { LINEARIZE_DATA_t a_in, r_out; init_linearize_data(&a_in, n, n, steps[1], steps[0]); init_linearize_data(&r_out, n, n, steps[3], steps[2]); BEGIN_OUTER_LOOP_2 int not_ok; linearize_@TYPE@_matrix(params.A, args[0], &a_in); not_ok = call_@lapack_func@(¶ms); if (!not_ok) { triu_@TYPE@_matrix(params.A, params.N); delinearize_@TYPE@_matrix(args[1], params.A, &r_out); } else { error_occurred = 1; nan_@TYPE@_matrix(args[1], &r_out); } END_OUTER_LOOP release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } static void @TYPE@_cholesky_lo(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { @TYPE@_cholesky('L', args, dimensions, steps); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* eig family */ typedef struct geev_params_struct { void *A; void *WR; /* RWORK in complex versions, REAL W buffer for (sd)geev*/ void *WI; void *VLR; /* REAL VL buffers for _geev where _ is s, d */ void *VRR; /* REAL VR buffers for _geev hwere _ is s, d */ void *WORK; void *W; /* final w */ void *VL; /* final vl */ void *VR; /* final vr */ fortran_int N; fortran_int LDA; fortran_int LDVL; fortran_int LDVR; fortran_int LWORK; char JOBVL; char JOBVR; } GEEV_PARAMS_t; static inline void dump_geev_params(const char *name, GEEV_PARAMS_t* params) { TRACE_TXT("\n%s\n" "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %p\n"\ "\t%10s: %d\n"\ "\t%10s: %d\n"\ "\t%10s: %d\n"\ "\t%10s: %d\n"\ "\t%10s: %d\n"\ "\t%10s: %c\n"\ "\t%10s: %c\n", name, "A", params->A, "WR", params->WR, "WI", params->WI, "VLR", params->VLR, "VRR", params->VRR, "WORK", params->WORK, "W", params->W, "VL", params->VL, "VR", params->VR, "N", (int)params->N, "LDA", (int)params->LDA, "LDVL", (int)params->LDVL, "LDVR", (int)params->LDVR, "LWORK", (int)params->LWORK, "JOBVL", params->JOBVL, "JOBVR", params->JOBVR); } /**begin repeat #TYPE=FLOAT,DOUBLE# #CTYPE=CFLOAT,CDOUBLE# #typ=float,double# #complextyp=COMPLEX_t,DOUBLECOMPLEX_t# #lapack_func=sgeev,dgeev# #zero=0.0f,0.0# */ static inline int init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n) { npy_uint8 *mem_buff=NULL; npy_uint8 *mem_buff2=NULL; npy_uint8 *a, *wr, *wi, *vlr, *vrr, *work, *w, *vl, *vr; size_t a_size = n*n*sizeof(@typ@); size_t wr_size = n*sizeof(@typ@); size_t wi_size = n*sizeof(@typ@); size_t vlr_size = jobvl=='V' ? n*n*sizeof(@typ@) : 0; size_t vrr_size = jobvr=='V' ? n*n*sizeof(@typ@) : 0; size_t w_size = wr_size*2; size_t vl_size = vlr_size*2; size_t vr_size = vrr_size*2; size_t work_count = 0; @typ@ work_size_query; fortran_int do_size_query = -1; fortran_int rv; /* allocate data for known sizes (all but work) */ mem_buff = malloc(a_size + wr_size + wi_size + vlr_size + vrr_size + w_size + vl_size + vr_size); if (!mem_buff) goto error; a = mem_buff; wr = a + a_size; wi = wr + wr_size; vlr = wi + wi_size; vrr = vlr + vlr_size; w = vrr + vrr_size; vl = w + w_size; vr = vl + vl_size; LAPACK(@lapack_func@)(&jobvl, &jobvr, &n, (void *)a, &n, (void *)wr, (void *)wi, (void *)vl, &n, (void *)vr, &n, &work_size_query, &do_size_query, &rv); if (0 != rv) goto error; work_count = (size_t)work_size_query; mem_buff2 = malloc(work_count*sizeof(@typ@)); if (!mem_buff2) goto error; work = mem_buff2; params->A = a; params->WR = wr; params->WI = wi; params->VLR = vlr; params->VRR = vrr; params->WORK = work; params->W = w; params->VL = vl; params->VR = vr; params->N = n; params->LDA = n; params->LDVL = n; params->LDVR = n; params->LWORK = (fortran_int)work_count; params->JOBVL = jobvl; params->JOBVR = jobvr; return 1; error: free(mem_buff2); free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static inline fortran_int call_@lapack_func@(GEEV_PARAMS_t* params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBVL, ¶ms->JOBVR, ¶ms->N, params->A, ¶ms->LDA, params->WR, params->WI, params->VLR, ¶ms->LDVL, params->VRR, ¶ms->LDVR, params->WORK, ¶ms->LWORK, &rv); return rv; } static inline void mk_@TYPE@_complex_array_from_real(@complextyp@ *c, const @typ@ *re, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { c[iter].array[0] = re[iter]; c[iter].array[1] = @zero@; } } static inline void mk_@TYPE@_complex_array(@complextyp@ *c, const @typ@ *re, const @typ@ *im, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { c[iter].array[0] = re[iter]; c[iter].array[1] = im[iter]; } } static inline void mk_@TYPE@_complex_array_conjugate_pair(@complextyp@ *c, const @typ@ *r, size_t n) { size_t iter; for (iter = 0; iter < n; ++iter) { @typ@ re = r[iter]; @typ@ im = r[iter+n]; c[iter].array[0] = re; c[iter].array[1] = im; c[iter+n].array[0] = re; c[iter+n].array[1] = -im; } } /* * make the complex eigenvectors from the real array produced by sgeev/zgeev. * c is the array where the results will be left. * r is the source array of reals produced by sgeev/zgeev * i is the eigenvalue imaginary part produced by sgeev/zgeev * n is so that the order of the matrix is n by n */ static inline void mk_@lapack_func@_complex_eigenvectors(@complextyp@ *c, const @typ@ *r, const @typ@ *i, size_t n) { size_t iter = 0; while (iter < n) { if (i[iter] == @zero@) { /* eigenvalue was real, eigenvectors as well... */ mk_@TYPE@_complex_array_from_real(c, r, n); c += n; r += n; iter ++; } else { /* eigenvalue was complex, generate a pair of eigenvectors */ mk_@TYPE@_complex_array_conjugate_pair(c, r, n); c += 2*n; r += 2*n; iter += 2; } } } static inline void process_@lapack_func@_results(GEEV_PARAMS_t *params) { /* REAL versions of geev need the results to be translated * into complex versions. This is the way to deal with imaginary * results. In our gufuncs we will always return complex arrays! */ mk_@TYPE@_complex_array(params->W, params->WR, params->WI, params->N); /* handle the eigenvectors */ if ('V' == params->JOBVL) { mk_@lapack_func@_complex_eigenvectors(params->VL, params->VLR, params->WI, params->N); } if ('V' == params->JOBVR) { mk_@lapack_func@_complex_eigenvectors(params->VR, params->VRR, params->WI, params->N); } } /**end repeat**/ /**begin repeat #TYPE=CFLOAT,CDOUBLE# #typ=COMPLEX_t,DOUBLECOMPLEX_t# #ftyp=fortran_complex,fortran_doublecomplex# #realtyp=float,double# #lapack_func=cgeev,zgeev# */ static inline int init_@lapack_func@(GEEV_PARAMS_t* params, char jobvl, char jobvr, fortran_int n) { npy_uint8 *mem_buff = NULL; npy_uint8 *mem_buff2 = NULL; npy_uint8 *a, *w, *vl, *vr, *work, *rwork; size_t a_size = n*n*sizeof(@ftyp@); size_t w_size = n*sizeof(@ftyp@); size_t vl_size = jobvl=='V'? n*n*sizeof(@ftyp@) : 0; size_t vr_size = jobvr=='V'? n*n*sizeof(@ftyp@) : 0; size_t rwork_size = 2*n*sizeof(@realtyp@); size_t work_count = 0; @typ@ work_size_query; fortran_int do_size_query = -1; fortran_int rv; size_t total_size = a_size + w_size + vl_size + vr_size + rwork_size; mem_buff = malloc(total_size); if (!mem_buff) goto error; a = mem_buff; w = a + a_size; vl = w + w_size; vr = vl + vl_size; rwork = vr + vr_size; LAPACK(@lapack_func@)(&jobvl, &jobvr, &n, (void *)a, &n, (void *)w, (void *)vl, &n, (void *)vr, &n, (void *)&work_size_query, &do_size_query, (void *)rwork, &rv); if (0 != rv) goto error; work_count = (size_t) work_size_query.array[0]; mem_buff2 = malloc(work_count*sizeof(@ftyp@)); if (!mem_buff2) goto error; work = mem_buff2; params->A = a; params->WR = rwork; params->WI = NULL; params->VLR = NULL; params->VRR = NULL; params->VL = vl; params->VR = vr; params->WORK = work; params->W = w; params->N = n; params->LDA = n; params->LDVL = n; params->LDVR = n; params->LWORK = (fortran_int)work_count; params->JOBVL = jobvl; params->JOBVR = jobvr; return 1; error: free(mem_buff2); free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static inline fortran_int call_@lapack_func@(GEEV_PARAMS_t* params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBVL, ¶ms->JOBVR, ¶ms->N, params->A, ¶ms->LDA, params->W, params->VL, ¶ms->LDVL, params->VR, ¶ms->LDVR, params->WORK, ¶ms->LWORK, params->WR, /* actually RWORK */ &rv); return rv; } static inline void process_@lapack_func@_results(GEEV_PARAMS_t *NPY_UNUSED(params)) { /* nothing to do here, complex versions are ready to copy out */ } /**end repeat**/ /**begin repeat #TYPE=FLOAT,DOUBLE,CDOUBLE# #COMPLEXTYPE=CFLOAT,CDOUBLE,CDOUBLE# #ftype=fortran_real,fortran_doublereal,fortran_doublecomplex# #lapack_func=sgeev,dgeev,zgeev# */ static inline void release_@lapack_func@(GEEV_PARAMS_t *params) { free(params->WORK); free(params->A); memset(params, 0, sizeof(*params)); } static inline void @TYPE@_eig_wrapper(char JOBVL, char JOBVR, char**args, npy_intp* dimensions, npy_intp* steps) { ptrdiff_t outer_steps[4]; size_t iter; size_t outer_dim = *dimensions++; size_t op_count = 2; int error_occurred = get_fp_invalid_and_clear(); GEEV_PARAMS_t geev_params; assert(JOBVL == 'N'); STACK_TRACE; op_count += 'V'==JOBVL?1:0; op_count += 'V'==JOBVR?1:0; for (iter=0; iter < op_count; ++iter) { outer_steps[iter] = (ptrdiff_t) steps[iter]; } steps += op_count; if (init_@lapack_func@(&geev_params, JOBVL, JOBVR, (fortran_int)dimensions[0])) { LINEARIZE_DATA_t a_in; LINEARIZE_DATA_t w_out; LINEARIZE_DATA_t vl_out; LINEARIZE_DATA_t vr_out; init_linearize_data(&a_in, geev_params.N, geev_params.N, steps[1], steps[0]); steps += 2; init_linearize_data(&w_out, 1, geev_params.N, 0, steps[0]); steps += 1; if ('V' == geev_params.JOBVL) { init_linearize_data(&vl_out, geev_params.N, geev_params.N, steps[1], steps[0]); steps += 2; } if ('V' == geev_params.JOBVR) { init_linearize_data(&vr_out, geev_params.N, geev_params.N, steps[1], steps[0]); } for (iter = 0; iter < outer_dim; ++iter) { int not_ok; char **arg_iter = args; /* copy the matrix in */ linearize_@TYPE@_matrix(geev_params.A, *arg_iter++, &a_in); not_ok = call_@lapack_func@(&geev_params); if (!not_ok) { process_@lapack_func@_results(&geev_params); delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, geev_params.W, &w_out); if ('V' == geev_params.JOBVL) delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, geev_params.VL, &vl_out); if ('V' == geev_params.JOBVR) delinearize_@COMPLEXTYPE@_matrix(*arg_iter++, geev_params.VR, &vr_out); } else { /* geev failed */ error_occurred = 1; nan_@COMPLEXTYPE@_matrix(*arg_iter++, &w_out); if ('V' == geev_params.JOBVL) nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vl_out); if ('V' == geev_params.JOBVR) nan_@COMPLEXTYPE@_matrix(*arg_iter++, &vr_out); } update_pointers((npy_uint8**)args, outer_steps, op_count); } release_@lapack_func@(&geev_params); } set_fp_invalid_or_clear(error_occurred); } static void @TYPE@_eig(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { @TYPE@_eig_wrapper('N', 'V', args, dimensions, steps); } static void @TYPE@_eigvals(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { @TYPE@_eig_wrapper('N', 'N', args, dimensions, steps); } /**end repeat**/ /* -------------------------------------------------------------------------- */ /* singular value decomposition */ typedef struct gessd_params_struct { void *A; void *S; void *U; void *VT; void *WORK; void *RWORK; void *IWORK; fortran_int M; fortran_int N; fortran_int LDA; fortran_int LDU; fortran_int LDVT; fortran_int LWORK; char JOBZ; } GESDD_PARAMS_t; static inline void dump_gesdd_params(const char *name, GESDD_PARAMS_t *params) { TRACE_TXT("\n%s:\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18p\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %18d\n"\ "%14s: %15c'%c'\n", name, "A", params->A, "S", params->S, "U", params->U, "VT", params->VT, "WORK", params->WORK, "RWORK", params->RWORK, "IWORK", params->IWORK, "M", (int)params->M, "N", (int)params->N, "LDA", (int)params->LDA, "LDU", (int)params->LDU, "LDVT", (int)params->LDVT, "LWORK", (int)params->LWORK, "JOBZ", ' ',params->JOBZ); } static inline int compute_urows_vtcolumns(char jobz, fortran_int m, fortran_int n, fortran_int *urows, fortran_int *vtcolumns) { fortran_int min_m_n = mM = m; params->N = n; params->A = a; params->S = s; params->U = u; params->VT = vt; params->WORK = work; params->RWORK = NULL; params->IWORK = iwork; params->M = m; params->N = n; params->LDA = m; params->LDU = m; params->LDVT = vt_column_count; params->LWORK = work_count; params->JOBZ = jobz; return 1; error: TRACE_TXT("%s failed init\n", __FUNCTION__); free(mem_buff); free(mem_buff2); memset(params, 0, sizeof(*params)); return 0; } static inline fortran_int call_@lapack_func@(GESDD_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->M, ¶ms->N, params->A, ¶ms->LDA, params->S, params->U, ¶ms->LDU, params->VT, ¶ms->LDVT, params->WORK, ¶ms->LWORK, params->IWORK, &rv); return rv; } /**end repeat**/ /**begin repeat #TYPE=CFLOAT,CDOUBLE# #ftyp=fortran_complex,fortran_doublecomplex# #frealtyp=fortran_real,fortran_doublereal# #typ=COMPLEX_t,DOUBLECOMPLEX_t# #lapack_func=cgesdd,zgesdd# */ static inline int init_@lapack_func@(GESDD_PARAMS_t *params, char jobz, fortran_int m, fortran_int n) { npy_uint8 *mem_buff = NULL, *mem_buff2 = NULL; npy_uint8 *a,*s, *u, *vt, *work, *rwork, *iwork; size_t a_size, s_size, u_size, vt_size, work_size, rwork_size, iwork_size; fortran_int u_row_count, vt_column_count, work_count; fortran_int min_m_n = marray[0]; work_size = (size_t)work_count * sizeof(@ftyp@); } mem_buff2 = malloc(work_size); if (!mem_buff2) goto error; work = mem_buff2; params->A = a; params->S = s; params->U = u; params->VT = vt; params->WORK = work; params->RWORK = rwork; params->IWORK = iwork; params->M = m; params->N = n; params->LDA = m; params->LDU = m; params->LDVT = vt_column_count; params->LWORK = work_count; params->JOBZ = jobz; return 1; error: TRACE_TXT("%s failed init\n", __FUNCTION__); free(mem_buff2); free(mem_buff); memset(params, 0, sizeof(*params)); return 0; } static inline fortran_int call_@lapack_func@(GESDD_PARAMS_t *params) { fortran_int rv; LAPACK(@lapack_func@)(¶ms->JOBZ, ¶ms->M, ¶ms->N, params->A, ¶ms->LDA, params->S, params->U, ¶ms->LDU, params->VT, ¶ms->LDVT, params->WORK, ¶ms->LWORK, params->RWORK, params->IWORK, &rv); return rv; } /**end repeat**/ /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE# #lapack_func=sgesdd,dgesdd,cgesdd,zgesdd# */ static inline void release_@lapack_func@(GESDD_PARAMS_t* params) { /* A and WORK contain allocated blocks */ free(params->A); free(params->WORK); memset(params, 0, sizeof(*params)); } static inline void @TYPE@_svd_wrapper(char JOBZ, char **args, npy_intp* dimensions, npy_intp* steps) { ptrdiff_t outer_steps[4]; int error_occurred = get_fp_invalid_and_clear(); size_t iter; size_t outer_dim = *dimensions++; size_t op_count = (JOBZ=='N')?2:4; GESDD_PARAMS_t params; for (iter=0; iter < op_count; ++iter) { outer_steps[iter] = (ptrdiff_t) steps[iter]; } steps += op_count; if (init_@lapack_func@(¶ms, JOBZ, (fortran_int)dimensions[0], (fortran_int)dimensions[1])) { LINEARIZE_DATA_t a_in, u_out, s_out, v_out; init_linearize_data(&a_in, params.N, params.M, steps[1], steps[0]); if ('N' == params.JOBZ) { /* only the singular values are wanted */ fortran_int min_m_n = params.M < params.N? params.M : params.N; init_linearize_data(&s_out, 1, min_m_n, 0, steps[2]); } else { fortran_int u_columns, v_rows; fortran_int min_m_n = params.M < params.N? params.M : params.N; if ('S' == params.JOBZ) { u_columns = min_m_n; v_rows = min_m_n; } else { u_columns = params.M; v_rows = params.N; } init_linearize_data(&u_out, u_columns, params.M, steps[3], steps[2]); init_linearize_data(&s_out, 1, min_m_n, 0, steps[4]); init_linearize_data(&v_out, params.N, v_rows, steps[6], steps[5]); } for (iter = 0; iter < outer_dim; ++iter) { int not_ok; /* copy the matrix in */ linearize_@TYPE@_matrix(params.A, args[0], &a_in); not_ok = call_@lapack_func@(¶ms); if (!not_ok) { if ('N' == params.JOBZ) { delinearize_@REALTYPE@_matrix(args[1], params.S, &s_out); } else { delinearize_@TYPE@_matrix(args[1], params.U, &u_out); delinearize_@REALTYPE@_matrix(args[2], params.S, &s_out); delinearize_@TYPE@_matrix(args[3], params.VT, &v_out); } } else { error_occurred = 1; if ('N' == params.JOBZ) { nan_@REALTYPE@_matrix(args[1], &s_out); } else { nan_@TYPE@_matrix(args[1], &u_out); nan_@REALTYPE@_matrix(args[2], &s_out); nan_@TYPE@_matrix(args[3], &v_out); } } update_pointers((npy_uint8**)args, outer_steps, op_count); } release_@lapack_func@(¶ms); } set_fp_invalid_or_clear(error_occurred); } /**end repeat*/ /* svd gufunc entry points */ /**begin repeat #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# */ static void @TYPE@_svd_N(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { @TYPE@_svd_wrapper('N', args, dimensions, steps); } static void @TYPE@_svd_S(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { @TYPE@_svd_wrapper('S', args, dimensions, steps); } static void @TYPE@_svd_A(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { @TYPE@_svd_wrapper('A', args, dimensions, steps); } /**end repeat**/ #pragma GCC diagnostic pop /* -------------------------------------------------------------------------- */ /* gufunc registration */ static void *array_of_nulls[] = { (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL }; #define FUNC_ARRAY_NAME(NAME) NAME ## _funcs #define GUFUNC_FUNC_ARRAY_REAL(NAME) \ static PyUFuncGenericFunction \ FUNC_ARRAY_NAME(NAME)[] = { \ FLOAT_ ## NAME, \ DOUBLE_ ## NAME \ } #define GUFUNC_FUNC_ARRAY_REAL_COMPLEX(NAME) \ static PyUFuncGenericFunction \ FUNC_ARRAY_NAME(NAME)[] = { \ FLOAT_ ## NAME, \ DOUBLE_ ## NAME, \ CFLOAT_ ## NAME, \ CDOUBLE_ ## NAME \ } /* There are problems with eig in complex single precision. * That kernel is disabled */ #define GUFUNC_FUNC_ARRAY_EIG(NAME) \ static PyUFuncGenericFunction \ FUNC_ARRAY_NAME(NAME)[] = { \ FLOAT_ ## NAME, \ DOUBLE_ ## NAME, \ CDOUBLE_ ## NAME \ } GUFUNC_FUNC_ARRAY_REAL_COMPLEX(slogdet); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(det); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighlo); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eighup); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshlo); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(eigvalshup); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(solve1); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(inv); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A); GUFUNC_FUNC_ARRAY_EIG(eig); GUFUNC_FUNC_ARRAY_EIG(eigvals); static char equal_2_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE }; static char equal_3_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE }; /* second result is logdet, that will always be a REAL */ static char slogdet_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE }; static char eigh_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE }; static char eighvals_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_FLOAT, NPY_CDOUBLE, NPY_DOUBLE }; static char eig_types[] = { NPY_FLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_DOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE }; static char eigvals_types[] = { NPY_FLOAT, NPY_CFLOAT, NPY_DOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE }; static char svd_1_1_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_FLOAT, NPY_CDOUBLE, NPY_DOUBLE }; static char svd_1_3_types[] = { NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE }; typedef struct gufunc_descriptor_struct { char *name; char *signature; char *doc; int ntypes; int nin; int nout; PyUFuncGenericFunction *funcs; char *types; } GUFUNC_DESCRIPTOR_t; GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = { { "slogdet", "(m,m)->(),()", "slogdet on the last two dimensions and broadcast on the rest. \n"\ "Results in two arrays, one with sign and the other with log of the"\ " determinants. \n"\ " \"(m,m)->(),()\" \n", 4, 1, 2, FUNC_ARRAY_NAME(slogdet), slogdet_types }, { "det", "(m,m)->()", "det of the last two dimensions and broadcast on the rest. \n"\ " \"(m,m)->()\" \n", 4, 1, 1, FUNC_ARRAY_NAME(det), equal_2_types }, { "eigh_lo", "(m,m)->(m),(m,m)", "eigh on the last two dimension and broadcast to the rest, using"\ " lower triangle \n"\ "Results in a vector of eigenvalues and a matrix with the"\ "eigenvectors. \n"\ " \"(m,m)->(m),(m,m)\" \n", 4, 1, 2, FUNC_ARRAY_NAME(eighlo), eigh_types }, { "eigh_up", "(m,m)->(m),(m,m)", "eigh on the last two dimension and broadcast to the rest, using"\ " upper triangle. \n"\ "Results in a vector of eigenvalues and a matrix with the"\ " eigenvectors. \n"\ " \"(m,m)->(m),(m,m)\" \n", 4, 1, 2, FUNC_ARRAY_NAME(eighup), eigh_types }, { "eigvalsh_lo", "(m,m)->(m)", "eigh on the last two dimension and broadcast to the rest, using"\ " lower triangle. \n"\ "Results in a vector of eigenvalues and a matrix with the"\ "eigenvectors. \n"\ " \"(m,m)->(m)\" \n", 4, 1, 1, FUNC_ARRAY_NAME(eigvalshlo), eighvals_types }, { "eigvalsh_up", "(m,m)->(m)", "eigvalsh on the last two dimension and broadcast to the rest,"\ " using upper triangle. \n"\ "Results in a vector of eigenvalues and a matrix with the"\ "eigenvectors.\n"\ " \"(m,m)->(m)\" \n", 4, 1, 1, FUNC_ARRAY_NAME(eigvalshup), eighvals_types }, { "solve", "(m,m),(m,n)->(m,n)", "solve the system a x = b, on the last two dimensions, broadcast"\ " to the rest. \n"\ "Results in a matrices with the solutions. \n"\ " \"(m,m),(m,n)->(m,n)\" \n", 4, 2, 1, FUNC_ARRAY_NAME(solve), equal_3_types }, { "solve1", "(m,m),(m)->(m)", "solve the system a x = b, for b being a vector, broadcast in"\ " the outer dimensions. \n"\ "Results in vectors with the solutions. \n"\ " \"(m,m),(m)->(m)\" \n", 4,2,1, FUNC_ARRAY_NAME(solve1), equal_3_types }, { "inv", "(m,m)->(m,m)", "compute the inverse of the last two dimensions and broadcast"\ " to the rest. \n"\ "Results in the inverse matrices. \n"\ " \"(m,m)->(m,m)\" \n", 4,1,1, FUNC_ARRAY_NAME(inv), equal_2_types }, { "cholesky_lo", "(m,m)->(m,m)", "cholesky decomposition of hermitian positive-definite matrices. \n"\ "Broadcast to all outer dimensions. \n"\ " \"(m,m)->(m,m)\" \n", 4, 1, 1, FUNC_ARRAY_NAME(cholesky_lo), equal_2_types }, { "svd_m", "(m,n)->(m)", "svd when n>=m. ", 4, 1, 1, FUNC_ARRAY_NAME(svd_N), svd_1_1_types }, { "svd_n", "(m,n)->(n)", "svd when n<=m", 4, 1, 1, FUNC_ARRAY_NAME(svd_N), svd_1_1_types }, { "svd_m_s", "(m,n)->(m,m),(m),(m,n)", "svd when m>=n", 4, 1, 3, FUNC_ARRAY_NAME(svd_S), svd_1_3_types }, { "svd_n_s", "(m,n)->(m,n),(n),(n,n)", "svd when m>=n", 4, 1, 3, FUNC_ARRAY_NAME(svd_S), svd_1_3_types }, { "svd_m_f", "(m,n)->(m,m),(m),(n,n)", "svd when m>=n", 4, 1, 3, FUNC_ARRAY_NAME(svd_A), svd_1_3_types }, { "svd_n_f", "(m,n)->(m,m),(n),(n,n)", "svd when m>=n", 4, 1, 3, FUNC_ARRAY_NAME(svd_A), svd_1_3_types }, { "eig", "(m,m)->(m),(m,m)", "eig on the last two dimension and broadcast to the rest. \n"\ "Results in a vector with the eigenvalues and a matrix with the"\ " eigenvectors. \n"\ " \"(m,m)->(m),(m,m)\" \n", 3, 1, 2, FUNC_ARRAY_NAME(eig), eig_types }, { "eigvals", "(m,m)->(m)", "eigvals on the last two dimension and broadcast to the rest. \n"\ "Results in a vector of eigenvalues. \n"\ " \"(m,m)->(m),(m,m)\" \n", 3, 1, 1, FUNC_ARRAY_NAME(eigvals), eigvals_types }, }; static void addUfuncs(PyObject *dictionary) { PyObject *f; int i; const int gufunc_count = sizeof(gufunc_descriptors)/ sizeof(gufunc_descriptors[0]); for (i=0; i < gufunc_count; i++) { GUFUNC_DESCRIPTOR_t* d = &gufunc_descriptors[i]; f = PyUFunc_FromFuncAndDataAndSignature(d->funcs, array_of_nulls, d->types, d->ntypes, d->nin, d->nout, PyUFunc_None, d->name, d->doc, 0, d->signature); PyDict_SetItemString(dictionary, d->name, f); #if 0 dump_ufunc_object((PyUFuncObject*) f); #endif Py_DECREF(f); } } /* -------------------------------------------------------------------------- */ /* Module initialization stuff */ static PyMethodDef UMath_LinAlgMethods[] = { {NULL, NULL, 0, NULL} /* Sentinel */ }; #if defined(NPY_PY3K) static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, UMATH_LINALG_MODULE_NAME, NULL, -1, UMath_LinAlgMethods, NULL, NULL, NULL, NULL }; #endif #if defined(NPY_PY3K) #define RETVAL m PyObject *PyInit__umath_linalg(void) #else #define RETVAL PyMODINIT_FUNC init_umath_linalg(void) #endif { PyObject *m; PyObject *d; PyObject *version; init_constants(); #if defined(NPY_PY3K) m = PyModule_Create(&moduledef); #else m = Py_InitModule(UMATH_LINALG_MODULE_NAME, UMath_LinAlgMethods); #endif if (m == NULL) return RETVAL; import_array(); import_ufunc(); d = PyModule_GetDict(m); version = PyString_FromString(umath_linalg_version_string); PyDict_SetItemString(d, "__version__", version); Py_DECREF(version); /* Load the ufunc operators into the module's namespace */ addUfuncs(d); if (PyErr_Occurred()) { PyErr_SetString(PyExc_RuntimeError, "cannot load _umath_linalg module."); } return RETVAL; }