Refactoring math routines. Added comments. Removed multithreading from fft routines. Switched to fftpack for FFT. Added Doxyfile

2018-02-06 12:01:27 +01:00 · 2018-02-06 12:01:27 +01:00 · 83aaf69bef
commit 83aaf69bef
parent 52e5a31bdb
28 changed files with 4508 additions and 1254 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,3 +12,7 @@ cython_debug
 beamforming/config.pxi
 beamforming/fft.c
 *.so
+test/test_bf
+test/test_fft
+test/test_math
+test/test_workers
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -13,6 +13,9 @@ add_definitions(-DASCEE_MAX_NUM_THREADS=8)

 add_definitions(-DASCEE_MAX_NUM_CHANNELS=80)

+# Reasonable maximum to the nfft size, at 48kHz this is 700s of data..
+add_definitions(-DASCEE_MAX_NFFT=33554432) # 2**25
+
 # ####################################### End of user-adjustable variables section

 add_definitions(-D_REENTRANT)
--- a/2482
+++ b/2482
--- a/beamforming/c/CMakeLists.txt
+++ b/beamforming/c/CMakeLists.txt
@ -6,10 +6,13 @@ endif(!ASCEE_DEBUG)
 add_library(beamforming_lib
  fft.c
  ascee_math.c
+  ascee_math_raw.c
+  ascee_alg.c
  ascee_assert.c
-  tracer.c
+  ascee_tracer.c
  window.c
-  # ps.c
+  aps.c
+  ps.c
  mq.c
  worker.c
  )
--- a/beamforming/c/aps.c
+++ b/beamforming/c/aps.c
@ -0,0 +1,228 @@
+// aps.c
+//
+// Author: J.A. de Jong -ASCEE
+// 
+// Description:
+//
+//////////////////////////////////////////////////////////////////////
+
+#include "aps.h"
+#include "ps.h"
+#include "ascee_alg.h"
+
+typedef struct AvPowerSpectra_s {
+    us os;              /**< Counter set to the position where
+                         * the next time block should be taken
+                         * from */
+    us nfft, nchannels;
+    us oo;
+
+    us naverages;               /* Counter that counts the number of
+                                 * averages taken for the computation
+                                 * of this averaged power spectra. */
+    
+    dmat buffer;                /**< Buffer storage of some of the
+                                 * previous samples. Number of rows is
+                                 * equal to nfft. */
+
+
+    cmat ps_storage;           /**< Here we store the averaged
+                                * results for each Cross-power
+                                * spectra computed so far. */
+    
+    cmat ps_single;             /**< This is the work area for a
+                                 * PowerSpectra computation on a
+                                 * single block */
+
+    PowerSpectra* ps;           /**< Pointer to underlying
+                                 * PowerSpectra calculator. */
+    
+} AvPowerSpectra;
+
+AvPowerSpectra* AvPowerSpectra_alloc(const us nfft,
+                                     const us nchannels,
+                                     const d overlap_percentage,
+                                     const WindowType wt) {
+    
+    fsTRACE(15);
+    
+    /* Check nfft */
+    if(nfft % 2 != 0 || nfft > ASCEE_MAX_NFFT) {
+        WARN("nfft should be even");
+        return NULL;
+    }
+
+    /* Check overlap percentage */
+    if(overlap_percentage >= 100) {
+        WARN("Overlap percentage >= 100!");
+        return NULL;
+    }
+    if(overlap_percentage < 0) {
+        WARN("Overlap percentage should be positive!");
+        return NULL;
+    }
+    
+    /* Compute and check overlap offset */
+    us oo = nfft - (us) (overlap_percentage*nfft/100);
+    if(oo == 0) {oo++;}
+        
+    PowerSpectra* ps = PowerSpectra_alloc(nfft,nchannels,wt);
+    if(!ps) {
+        WARN(ALLOCFAILED "ps");
+        return NULL;
+    }
+
+    AvPowerSpectra* aps = a_malloc(sizeof(AvPowerSpectra));
+    if(!aps) {
+        WARN("Allocation of AvPowerSpectra memory failed");
+        PowerSpectra_free(ps);
+        return NULL;
+    }
+    aps->nchannels = nchannels;
+    aps->nfft = nfft;
+    aps->ps = ps;
+    aps->naverages = 0;
+    aps->oo = oo;
+    aps->os = oo;
+
+    /* Allocate vectors and matrices */
+    aps->buffer = dmat_alloc(nfft,nchannels);
+    aps->ps_storage = cmat_alloc(nfft/2+1,nchannels*nchannels);
+    aps->ps_single = cmat_alloc(nfft/2+1,nchannels*nchannels);
+
+    cmat_set(&aps->ps_storage,0);
+    
+    return aps;
+}
+
+
+us AvPowerSpectra_getAverages(const AvPowerSpectra* ps) {
+    return ps->naverages;
+}
+
+/** 
+ * Helper function that adds a block of time data to the APS
+ *
+ * @param aps AvPowerSpectra handle 
+ * @param block Time data block. Size should be exactly nfft*nchannels.
+ */
+static void AvPowerSpectra_addBlock(AvPowerSpectra* aps,
+                                    const dmat* block) {
+    fsTRACE(15);
+    
+    dbgassert(aps && block,NULLPTRDEREF);
+    dbgassert(block->n_rows == aps->nfft,"Invalid block n_rows");
+    dbgassert(block->n_cols == aps->nchannels,"Invalid block n_cols");
+
+    cmat ps_single = aps->ps_single;
+    cmat ps_storage = aps->ps_storage;
+    
+    c naverages = (++aps->naverages);
+    
+    /* Scale previous result */
+    cmat_scale(&ps_storage,
+               (naverages-1)/naverages);
+
+
+    PowerSpectra_compute(aps->ps,
+                         block,
+                         &ps_single);
+
+
+    /* Add new result, scaled properly */
+    cmat_add_cmat(&ps_storage,
+                  &ps_single,1/naverages);
+
+    
+    feTRACE(15);
+}
+
+
+cmat* AvPowerSpectra_addTimeData(AvPowerSpectra* aps,
+                                const dmat* timedata) {
+    
+    fsTRACE(15);
+    
+    dbgassert(aps && timedata,NULLPTRDEREF);
+    const us nchannels = aps->nchannels;
+    const us nfft = aps->nfft;
+    
+    dbgassert(timedata->n_cols == nchannels,"Invalid time data");
+    dbgassert(timedata->n_rows >= nfft,"Invalid time data. "
+              "Should at least have nfft rows");
+
+    const us oo = aps->oo;
+    us* os = &aps->os;
+    us os_timedata = 0;
+
+    dmat buffer = aps->buffer;
+
+    /* Retrieve the buffer and use it to make the first time block. */
+    if(*os < oo) {
+
+        dmat tmp = dmat_alloc(nfft,nchannels);
+
+        copy_dmat_rows(&tmp,
+                       &buffer,
+                       *os,       /* Startrow_from */
+                       0,         /* Startrow to */
+                       nfft - *os /* nrows */
+            );
+
+        copy_dmat_rows(&tmp,
+                       timedata,
+                       0,
+                       nfft - *os,
+                       *os
+            );
+
+
+        AvPowerSpectra_addBlock(aps,&tmp);
+
+        os_timedata = oo + *os - nfft;
+        
+        dmat_free(&tmp);
+    }
+
+    /* Run until we cannot go any further */
+    while (os_timedata + nfft <= timedata->n_rows) {
+        dmat tmp = dmat_submat(timedata,
+                               os_timedata,
+                               0,nfft,nchannels);
+
+        AvPowerSpectra_addBlock(aps,&tmp);
+        
+        os_timedata += oo;
+
+        dmat_free(&tmp);
+    }
+
+    /* We copy the last piece of samples from the timedata to the
+     * buffer */
+    copy_dmat_rows(&buffer,
+                   timedata,
+                   timedata->n_rows-nfft,
+                   0,
+                   nfft);
+    *os = os_timedata+nfft-timedata->n_rows;
+    
+    dbgassert(*os < nfft,"BUG");
+
+    feTRACE(15);
+    return &aps->ps_storage;
+}
+
+void AvPowerSpectra_free(AvPowerSpectra* aps) {
+    fsTRACE(15);
+
+    PowerSpectra_free(aps->ps);
+    dmat_free(&aps->buffer);
+    cmat_free(&aps->ps_storage);
+    cmat_free(&aps->ps_single);
+    a_free(aps);
+    
+    feTRACE(15);
+}
+
+
+//////////////////////////////////////////////////////////////////////
--- a/beamforming/c/aps.h
+++ b/beamforming/c/aps.h
@ -0,0 +1,77 @@
+// aps.h
+//
+// Author: J.A. de Jong - ASCEE
+//
+// Description:
+//
+//////////////////////////////////////////////////////////////////////
+#pragma once
+#ifndef APS_H
+#define APS_H
+#include "types.h"
+#include "ascee_math.h"
+#include "window.h"
+
+typedef struct AvPowerSpectra_s AvPowerSpectra;
+
+/** 
+ * Allocates a AvPowerSpectra object which is used to compute an
+ * averaged power spectra from given input time samples.
+ *
+ * @param[in] nfft Size of the fft,
+ *
+ * @param[in] nchannels Number of channels of time data to allocate
+ * for.
+ *
+ * @param[in] overlap_percentage. Should be 0<= overlap_pct < 100. The
+ * overlap percentage will be coerced, as the offset will be an
+ * integer number of samples at all cases. Therefore, the real overlap
+ * percentage can be obtained later on
+ *
+ * @param wt 
+ *
+ * @return 
+ */
+AvPowerSpectra* AvPowerSpectra_alloc(const us nfft,
+                                     const us nchannels,
+                                     const d overlap_percentage,
+                                     const WindowType wt);
+                                 
+
+/** 
+ * Computes the real overlap offset 
+ *
+ * @param aps 
+ */
+d AvPowerSpectra_realoverlappct(const AvPowerSpectra* aps);
+
+/** 
+ * Return the current number of averages taken to obtain the result.
+ *
+ * @return The number of averages taken.
+ */
+us AvPowerSpectra_getAverages(const AvPowerSpectra*);
+
+/** 
+ * Add time data to this Averaged Power Spectra. Remembers the part
+ * that should be overlapped with the next time data.
+ *
+ * @param aps AvPowerSpectra handle 
+ *
+ * @param timedata Pointer to timedata buffer. Number of rows SHOULD
+ * *at least* be nfft. Number of columns should exactly be nchannels.
+ *
+ * @return pointer to the averaged power spectra buffer that is being
+ * written to. Pointer is valid until AvPowerSpectra_free() is called.
+ */
+cmat* AvPowerSpectra_addTimeData(AvPowerSpectra* aps,
+                                 const dmat* timedata);
+
+/** 
+ * Free storage of the AvPowerSpectra
+ */
+void AvPowerSpectra_free(AvPowerSpectra*);
+
+
+#endif // APS_H
+//////////////////////////////////////////////////////////////////////
--- a/beamforming/c/ascee_alg.c
+++ b/beamforming/c/ascee_alg.c
@ -0,0 +1,230 @@
+// ascee_alg.c
+//
+// Author: J.A. de Jong -ASCEE
+// 
+// Description:
+// (Linear) algebra routine implementations
+//////////////////////////////////////////////////////////////////////
+
+#include "ascee_alg.h"
+
+
+
+
+void cmv_dot(const cmat* A,const vc* restrict x,vc* restrict b){
+
+    assert(A->n_rows == b->size);
+    assert(A->n_cols == x->size);
+	
+    #if ASCEE_USE_BLAS == 1
+
+    /* typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; */
+    /* typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; */
+    /* 
+       void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order,
+       OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,
+       OPENBLAS_CONST blasint m,
+       OPENBLAS_CONST blasint n,
+       OPENBLAS_CONST double *alpha,
+       OPENBLAS_CONST double  *a,
+       OPENBLAS_CONST blasint lda,
+       OPENBLAS_CONST double  *x,
+       OPENBLAS_CONST blasint incx,
+       OPENBLAS_CONST double *beta,
+       double  *y,
+       OPENBLAS_CONST blasint incy);
+    */
+    c alpha = 1.0;
+    c beta = 0.0;
+    cblas_zgemv(CblasColMajor,
+                CblasNoTrans,
+                A->n_rows,
+                A->n_cols,
+                (d*) &alpha,			/* alpha */
+                (d*) A->data,		/* A */
+                A->n_rows,		/* lda */
+                (d*) x->data,		/*  */
+                1,
+                (d*) &beta,			/* beta */
+                (d*) b->data,
+                1);
+				
+				
+				
+    #else
+    size_t i,j;
+    size_t n_rows = A->n_rows;
+
+    vc_set(b,0.0);
+
+    iVARTRACE(20,A->n_cols);
+    iVARTRACE(20,A->n_rows);
+
+    for(j=0;j<A->n_cols;j++){
+        for(i=0;i<A->n_rows;i++) {
+
+            c* Aij = &A->data[i+j*n_rows];
+            b->data[i] += *Aij * x->data[j];
+
+        }
+
+    }
+
+
+    #endif
+}
+
+/// The code below here is not yet worked on. Should be improved to
+/// directly couple to openblas, instead of using lapacke.h
+#if 0
+
+/* These functions can be directly linked to openBLAS */
+#define lapack_complex_double   double _Complex
+#define lapack_complex_float   float _Complex
+
+#define LAPACK_ROW_MAJOR               101
+#define LAPACK_COL_MAJOR               102
+
+#define LAPACK_WORK_MEMORY_ERROR       -1010
+#define LAPACK_TRANSPOSE_MEMORY_ERROR  -1011
+
+typedef int lapack_int;
+
+int LAPACKE_cgelss( int matrix_layout, int m, int n,
+                    int nrhs, lapack_complex_float* a,
+                    int lda, lapack_complex_float* b,
+                    int ldb, float* s, float rcond,
+                    int* rank );
+int LAPACKE_zgelss( int matrix_layout, int m, int n,
+                    int nrhs, lapack_complex_double* a,
+                    int lda, lapack_complex_double* b,
+                    int ldb, double* s, double rcond,
+                    int* rank );
+
+lapack_int LAPACKE_zgels( int matrix_layout, char trans, lapack_int m,
+                          lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb );
+
+
+
+
+#if ASCEE_FLOAT == 64
+
+#define lapack_gelss LAPACKE_zgelss
+#define lapack_gels LAPACKE_zgels
+#else
+
+#define lapack_gelss LAPACKE_cgelss
+#endif
+
+#define max(a,b) ((a)>(b)?(a):(b))
+
+
+/* int lsq_solve(const cmat* A,const vc* b,vc* x){ */
+    
+/*     POOL_INIT(lsq_solve_pool); */
+/*     int rv; */
+/*     /\* M: number of rows of matrix *\/ */
+/*     /\* N: Number of columns *\/ */
+/*     /\* Norm: L2|b-A*x| *\/ */
+/*     /\* NRHS: Number of right hand sides: Number of columns of matrix B *\/ */
+
+/*     assert(A->n_rows>=A->n_cols); */
+/*     assert(x->size == A->n_cols); */
+/*     assert(b->size == A->n_rows); */
+	
+/*     int info; */
+	
+/*     size_t lda = max(1,A->n_rows); */
+/*     size_t ldb = max(lda,A->n_cols); */
+	
+/*     /\* Make explicit copy of matrix A data, as it will be overwritten */
+/*      * by lapack_gels *\/ */
+/*     c* A_data = Pool_allocatec(&lsq_solve_pool,A->n_rows*A->n_cols); */
+/*     c_copy(A_data,A->data,A->n_cols*A->n_rows); */
+
+/*     c* work_data = Pool_allocatec(&lsq_solve_pool,b->size); */
+/*     c_copy(work_data,b->data,b->size); */
+	
+/*     /\* Lapack documentation says: *\/ */
+/*     /\* 	if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */
+/*         squares solution vectors; the residual sum of squares for the */
+/*         solution in each column is given by the sum of squares of the */
+/*         modulus of elements N+1 to M in that column; */
+/*     *\/ */
+
+	
+/*     /\* We always assume one RHS column *\/ */
+/*     const int nrhs = 1; */
+
+/*     /\* General Least Squares Solve *\/ */
+/*     info = lapack_gels(LAPACK_COL_MAJOR, /\* Column-major ordering *\/ */
+/*                        'N', */
+/*                        A->n_rows,	/\* Number of rows in matrix *\/ */
+/*                        A->n_cols,	/\* Number of columns *\/ */
+/*                        nrhs,   /\* nrhs, which is number_mics *\/ */
+/*                        A_data, /\* The A-matrix *\/ */
+/*                        lda,	  /\* lda: the leading dimension of matrix A *\/ */
+/*                        work_data,	  /\* The b-matrix *\/ */
+/*                        ldb);  /\* ldb: the leading dimension of b: max(1,M,N) *\/ */
+		
+/*     if(info==0){ */
+/*         c_copy(x->data,work_data,x->size); */
+/*         rv = SUCCESS; */
+/*     } */
+/*     else { */
+/*         memset(x->data,0,x->size); */
+/*         WARN("LAPACK INFO VALUE"); */
+/*         printf("%i\n", info ); */
+/*         TRACE(15,"Solving least squares problem failed\n"); */
+
+/*         rv = FAILURE; */
+/*     } */
+
+/*     Pool_free(&lsq_solve_pool,A_data); */
+/*     Pool_free(&lsq_solve_pool,work_data); */
+/*     POOL_EXIT(lsq_solve_pool,15);     */
+/*     return rv; */
+    
+/* } */
+
+/* d c_normdiff(const cmat* A,const cmat* B) { */
+
+/*     TRACE(15,"c_normdif"); */
+	
+/*     dbgassert(A->n_cols==B->n_cols,"Number of columns of A and B " */
+/*               "should be equal"); */
+/*     dbgassert(A->n_rows==B->n_rows,"Number of rows of A and B " */
+/*               "should be equal"); */
+
+/*     size_t size = A->n_cols*A->n_rows; */
+
+/*     vc diff_temp = vc_al[MAX_MATRIX_SIZE]; */
+
+/*     c_copy(diff_temp,A->data,size); */
+
+/*     c alpha = -1.0; */
+
+/*     /\* This routine computes y <- alpha*x + beta*y *\/ */
+	
+	
+/*     /\* void cblas_zaxpy(OPENBLAS_CONST blasint n,  *\/ */
+/*     /\* 				 OPENBLAS_CONST double *alpha, *\/ */
+/*     /\* 				 OPENBLAS_CONST double *x, *\/ */
+/*     /\* 				 OPENBLAS_CONST blasint incx, *\/ */
+/*     /\* 				 double *y, *\/ */
+/*     /\* 				 OPENBLAS_CONST blasint incy); *\/ */
+
+/*     cblas_zaxpy(size, */
+/*                 (d*) &alpha, */
+/*                 (d*) B->data, */
+/*                 1, */
+/*                 (d*) diff_temp, */
+/*                 1 ); */
+	
+/*     return c_norm(diff_temp,size); */
+/* } */
+#endif  /* if 0 */
+
+//////////////////////////////////////////////////////////////////////
--- a/beamforming/c/ascee_alg.h
+++ b/beamforming/c/ascee_alg.h
@ -0,0 +1,149 @@
+// ascee_alg.h
+//
+// Author: J.A. de Jong - ASCEE
+//
+// Description:
+// (Linear) algebra routines on matrices and vectors
+//////////////////////////////////////////////////////////////////////
+#pragma once
+#ifndef ASCEE_ALG_H
+#define ASCEE_ALG_H
+#include "ascee_math.h"
+
+/** 
+ * Compute the dot product of two vectors of floats
+ *
+ * @param a First vector
+ * @param b Second second vector
+ * @return dot product as float
+ */
+static inline d vd_dot(const vd * a,const vd* b) {
+    dbgassert(a->size == b->size,SIZEINEQUAL);
+    return d_dot(a->ptr,b->ptr,a->size);
+}
+
+/** 
+ * y = fac * y
+ *
+ * @param y 
+ * @param fac scale factor
+ */
+static inline void dmat_scale(dmat* y,const c fac){
+    dbgassert(y,NULLPTRDEREF);
+    if(likely(y->data)) {
+        d_scale(y->data,fac,y->n_cols*y->n_rows);
+    }
+    else {
+        for(us col=0;col<y->n_cols;col++) {
+            d_scale(y->col_ptrs[col],fac,y->n_rows);
+        }
+    }
+}
+/** 
+ * y = fac * y
+ *
+ * @param y 
+ * @param fac scale factor
+ */
+static inline void cmat_scale(cmat* y,const c fac){
+    dbgassert(y,NULLPTRDEREF);
+    dbgassert(y,NULLPTRDEREF);
+    if(likely(y->data)) {
+        c_scale(y->data,fac,y->n_cols*y->n_rows);
+    }
+    else {
+        for(us col=0;col<y->n_cols;col++) {
+            c_scale(y->col_ptrs[col],fac,y->n_rows);
+        }
+    }
+}
+
+/** 
+ * x = x + fac*y
+ *
+ * @param x 
+ * @param y 
+ * @param fac 
+ */
+static inline void dmat_add_dmat(dmat* x,dmat* y,d fac) {
+    dbgassert(x && y,NULLPTRDEREF);
+    dbgassert(x->n_cols == y->n_cols,SIZEINEQUAL);
+    dbgassert(x->n_rows == y->n_rows,SIZEINEQUAL);
+    if(likely(x->data && y->data)) {
+        d_add_to(x->data,y->data,fac,x->n_cols*x->n_rows);
+    }
+    else {
+        for(us col=0;col<y->n_cols;col++) {
+            d_add_to(x->col_ptrs[col],y->col_ptrs[col],fac,x->n_rows);
+        }
+    }
+}
+/** 
+ * x = x + fac*y
+ *
+ * @param[in,out] x 
+ * @param[in] y 
+ * @param[in] fac 
+ */
+static inline void cmat_add_cmat(cmat* x,cmat* y,c fac) {
+    dbgassert(x && y,NULLPTRDEREF);
+    dbgassert(x->n_cols == y->n_cols,SIZEINEQUAL);
+    dbgassert(x->n_rows == y->n_rows,SIZEINEQUAL);
+    if(likely(x->data && y->data)) {
+        c_add_to(x->data,y->data,fac,x->n_cols*x->n_rows);
+    }
+    else {
+        for(us col=0;col<y->n_cols;col++) {
+            c_add_to(x->col_ptrs[col],y->col_ptrs[col],fac,x->n_rows);
+        }
+    }
+}
+
+/** 
+ * Compute the element-wise (Hadamard) product of a and b, and store
+ * in result
+ *
+ * @param[out] result 
+ * @param[in] a 
+ * @param[in] b 
+ */
+static inline void vc_elem_prod(vc* result,vc* a,vc* b) {
+    dbgassert(result  && a && b,NULLPTRDEREF);
+    dbgassert(result->size==a->size,SIZEINEQUAL);
+    dbgassert(b->size==a->size,SIZEINEQUAL);
+    c_elem_prod_c(result->ptr,a->ptr,b->ptr,a->size);
+}
+/** 
+ * Compute the matrix vector product for complex-valued types: b = A*x.
+ *
+ * @param[in] A Matrix A
+ * @param[in] x Vector x
+ * @param[out] b Result of computation
+ */
+void cmv_dot(const cmat* A,
+             const vc* restrict x,
+             vc* restrict b);
+
+int lsq_solve(const cmat* A,
+              const vc* restrict b,
+              vc* restrict x);
+
+/** 
+ * Compute the norm of the difference of two complex matrices
+ *
+ * @param A 
+ * @param B 
+ */
+d cmat_normdiff(const cmat* A,const cmat* B);
+
+/** 
+ * Computes the Kronecker product of a kron b, stores result in result.
+ *
+ * @param a a
+ * @param b b
+ * @param result a kron b
+ */
+void kronecker_product(const cmat* a,const cmat* b,cmat* result);
+
+#endif // ASCEE_ALG_H
+//////////////////////////////////////////////////////////////////////
--- a/beamforming/c/ascee_math.c
+++ b/beamforming/c/ascee_math.c
@ -1,33 +1,44 @@
-// si_math.c
+// ascee_math.c
 //
-// last-edit-by: J.A. de Jong 
+// Author: J.A. de Jong -ASCEE
 // 
 // Description:
-//
+// 
 //////////////////////////////////////////////////////////////////////
 #define TRACERPLUS (-10)
+#include "ascee_math.h"

 #include "ascee_assert.h"
 #include "ascee_math.h"
-#include "tracer.h"
+#include "ascee_tracer.h"

-#if ASCEE_USE_BLAS
-#include <cblas.h>
-#endif

 #include <math.h>

 #ifdef ASCEE_DEBUG
+void print_dmat(const dmat* m) {
+    size_t row,col;
+    for(row=0;row<m->n_rows;row++){
+        for(col=0;col<m->n_cols;col++){
+            d val = *getdmatval(m,row,col);
+            printf("%c%2.2e ", val<0?'-':' ' ,d_abs(val));
+			
+        }
+        printf("\n");
+
+    }
+}
 void print_cmat(const cmat* m) {
    size_t row,col;
    for(row=0;row<m->n_rows;row++){
        for(col=0;col<m->n_cols;col++){
-            c val = m->data[row+m->n_rows*col];
+            c val = *getcmatval(m,row,col);

            d rval = creal(val);
            d ival = cimag(val);
 			
-            printf("%c%2.2e%c%2.2ei ",rval< 0 ?'-': ' ', d_abs(rval),ival<0 ? '-' : '+',d_abs(ival) ) ;
+            printf("%c%2.2e%c%2.2ei ",rval< 0 ?'-': ' ',
+                   d_abs(rval),ival<0 ? '-' : '+',d_abs(ival) ) ;
 			
        }
        printf("\n");
@ -60,394 +71,7 @@ void print_vd(const vd* m) {
        printf("\n");
    }
 }
-void print_dmat(const dmat* m) {
-    size_t row,col;
-    for(row=0;row<m->n_rows;row++){
-        for(col=0;col<m->n_cols;col++){
-            d val = m->data[row+m->n_rows*col];
-            printf("%c%2.2e ", val<0?'-':' ' ,d_abs(val));
-			
-        }
-        printf("\n");

-    }
-}
 #endif

-void d_elem_prod_d(d res[],
-                   const d arr1[],
-                   const d arr2[],
-                   const us size) {
-
-    #if ASCEE_USE_BLAS
-
-    #if ASCEE_DEBUG
-
-    if(arr1 == arr2) {
-        DBGWARN("d_elem_prod_d: Array 1 and array 2 point to the same"
-                " memory. This results in pointer aliasing, for which"
-                " testing is still to be done. Results might be"
-                " unrealiable.");
-    }
-
-    #endif
-
-
-    #if ASCEE_DOUBLE_PRECISION
-    #define elem_prod_fun cblas_dsbmv
-    #else
-    #define elem_prod_fun cblas_ssbmv
-    #endif
-    /* These parameters do not matter for this specific case */
-    const CBLAS_ORDER  mat_order= CblasColMajor;
-    const CBLAS_UPLO   uplo = CblasLower;
-
-    /* Extra multiplication factor */
-    const d alpha = 1.0;
-
-    /* void cblas_dsbmv(OPENBLAS_CONST enum CBLAS_ORDER order, */
-    /*                  OPENBLAS_CONST enum CBLAS_UPLO Uplo, */
-    /*                  OPENBLAS_CONST blasint N, */
-    /*                  OPENBLAS_CONST blasint K, */
-    /*                  OPENBLAS_CONST double alpha, */
-    /*                  OPENBLAS_CONST double *A, */
-    /*                  OPENBLAS_CONST blasint lda, */
-    /*                  OPENBLAS_CONST double *X, */
-    /*                  OPENBLAS_CONST blasint incX, */
-    /*                  OPENBLAS_CONST double beta, */
-    /*                  double *Y, */
-    /*                  OPENBLAS_CONST blasint incY); */
-
-    elem_prod_fun(mat_order,
-                  uplo,
-                  (blasint) size,
-                  0,             // Just the diagonal; 0 super-diagonal bands
-                  alpha,        /* Multiplication factor alpha */
-                  arr1,
-                  1,            /* LDA */
-                  arr2,         /* x */
-                  1, /* incX = 1 */
-                  0.0,          /* Beta */
-                  res,    /* The Y matrix to write to */
-                  1); /* incY */
-    #undef elem_prod_fun
-
-    #else  /* No blas routines, routine is very simple, but here we
-            * go! */
-    DBGWARN("Performing slow non-blas vector-vector multiplication");
-    for(us i=0;i<size;i++) {
-        res[i] = arr1[i]*arr2[i];
-    }
-    #endif
-}
-
-void c_elem_prod_c(c res[],
-                   const c arr1[],
-                   const c arr2[],
-                   const us size) {
-
-    TRACE(15,"c_elem_prod_c");
-    uVARTRACE(15,size);
-    
-    #if ASCEE_USE_BLAS
-
-    #if ASCEE_DEBUG
-
-    if(arr1 == arr2) {
-        DBGWARN("c_elem_prod_c: Array 1 and array 2 point to the same"
-                " memory. This results in pointer aliasing, for which"
-                " testing is still to be done. Results might be"
-                " unrealiable.");
-    }
-
-    #endif  /* ASCEE_DEBUG */
-
-
-    #if ASCEE_DOUBLE_PRECISION
-    #define elem_prod_fun cblas_zgbmv
-    #else
-    #define elem_prod_fun cblas_cgbmv
-    #endif
-
-    /* These parameters do not matter for this specific case */
-    const CBLAS_ORDER  mat_order= CblasColMajor;
-    const CBLAS_TRANSPOSE tr = CblasNoTrans;
-
-    const c alpha = 1.0;
-    const c beta = 0.0;
-    TRACE(15,"Calling " annestr(elem_prod_fun));
-    
-    elem_prod_fun(mat_order,
-                  tr,
-                  (blasint) size, /* M: Number of rows */
-                  (blasint) size, /* B: Number of columns */
-                  0,              /* KL: Number of sub-diagonals */
-                  0,              /* KU: Number of super-diagonals */
-                  (d*) &alpha,        /* Multiplication factor */
-                  (d*) arr2,          /* A */
-                  1,            /* LDA */
-                  (d*) arr1,    /* x */
-                  1, /* incX = 1 */
-                  (d*) &beta,
-                  (d*) res,    /* The Y matrix to write to */
-                  1); /* incY */
-
-    #undef elem_prod_fun
-
-    #else  /* No blas routines, routine is very simple, but here we
-            * go! */
-    DBGWARN("Performing slow non-blas vector-vector multiplication");
-    for(us i=0;i<size;i++) {
-        res[i] = arr1[i]*arr2[i];
-    }
-    #endif
-}
-
-
-void cmv_dot(const cmat* A,const vc* restrict x,vc* restrict b){
-
-    assert(A->n_rows == b->size);
-    assert(A->n_cols == x->size);
-	
-    #if ASCEE_USE_BLAS == 1
-
-    /* typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; */
-    /* typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; */
-    /* 
-       void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order,
-       OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,
-       OPENBLAS_CONST blasint m,
-       OPENBLAS_CONST blasint n,
-       OPENBLAS_CONST double *alpha,
-       OPENBLAS_CONST double  *a,
-       OPENBLAS_CONST blasint lda,
-       OPENBLAS_CONST double  *x,
-       OPENBLAS_CONST blasint incx,
-       OPENBLAS_CONST double *beta,
-       double  *y,
-       OPENBLAS_CONST blasint incy);
-    */
-    c alpha = 1.0;
-    c beta = 0.0;
-    cblas_zgemv(CblasColMajor,
-                CblasNoTrans,
-                A->n_rows,
-                A->n_cols,
-                (d*) &alpha,			/* alpha */
-                (d*) A->data,		/* A */
-                A->n_rows,		/* lda */
-                (d*) x->data,		/*  */
-                1,
-                (d*) &beta,			/* beta */
-                (d*) b->data,
-                1);
-				
-				
-				
-    #else
-    size_t i,j;
-    size_t n_rows = A->n_rows;
-
-    vc_set(b,0.0);
-
-    iVARTRACE(20,A->n_cols);
-    iVARTRACE(20,A->n_rows);
-
-    for(j=0;j<A->n_cols;j++){
-        for(i=0;i<A->n_rows;i++) {
-
-            c* Aij = &A->data[i+j*n_rows];
-            b->data[i] += *Aij * x->data[j];
-
-        }
-
-    }
-
-
-    #endif
-}
-	
-void kronecker_product(const cmat* a,const cmat* b,cmat* result){
-
-    assert(result->n_rows == a->n_rows*b->n_rows);
-    assert(result->n_cols == a->n_cols*b->n_cols);
-
-    c a_rs;
-    c b_vw;
-
-    int r_col;
-    int r_row;
-
-    for(size_t r=0; r< a->n_rows;r++){
-
-        for(size_t s=0; s <a->n_cols;s++) {
-
-            for(size_t v=0;v < b->n_rows; v++) {
-
-                for(size_t w=0;w < b->n_cols;w++) {
-					
-                    a_rs = *getcmatval(a,r,s);
-                    b_vw = *getcmatval(b,v,w);
-
-                    r_row = b->n_rows*r+v;
-                    r_col = b->n_cols*s+w;
-					
-                    result->data[r_row + r_col * result->n_rows] = a_rs * b_vw;
-
-                }
-            }
-        }
-    }
-} /* void kronecker_product */
-
-/* #include <lapacke.h> */
-/* These functions can be directly linked to openBLAS */
-#define lapack_complex_double   double _Complex
-#define lapack_complex_float   float _Complex
-
-#define LAPACK_ROW_MAJOR               101
-#define LAPACK_COL_MAJOR               102
-
-#define LAPACK_WORK_MEMORY_ERROR       -1010
-#define LAPACK_TRANSPOSE_MEMORY_ERROR  -1011
-
-typedef int lapack_int;
-
-int LAPACKE_cgelss( int matrix_layout, int m, int n,
-                    int nrhs, lapack_complex_float* a,
-                    int lda, lapack_complex_float* b,
-                    int ldb, float* s, float rcond,
-                    int* rank );
-int LAPACKE_zgelss( int matrix_layout, int m, int n,
-                    int nrhs, lapack_complex_double* a,
-                    int lda, lapack_complex_double* b,
-                    int ldb, double* s, double rcond,
-                    int* rank );
-
-lapack_int LAPACKE_zgels( int matrix_layout, char trans, lapack_int m,
-                          lapack_int n, lapack_int nrhs,
-                          lapack_complex_double* a, lapack_int lda,
-                          lapack_complex_double* b, lapack_int ldb );
-
-
-
-
-#if ASCEE_FLOAT == 64
-
-#define lapack_gelss LAPACKE_zgelss
-#define lapack_gels LAPACKE_zgels
-#else
-
-#define lapack_gelss LAPACKE_cgelss
-#endif
-
-#define max(a,b) ((a)>(b)?(a):(b))
-
-
-/* int lsq_solve(const cmat* A,const vc* b,vc* x){ */
-    
-/*     POOL_INIT(lsq_solve_pool); */
-/*     int rv; */
-/*     /\* M: number of rows of matrix *\/ */
-/*     /\* N: Number of columns *\/ */
-/*     /\* Norm: L2|b-A*x| *\/ */
-/*     /\* NRHS: Number of right hand sides: Number of columns of matrix B *\/ */
-
-/*     assert(A->n_rows>=A->n_cols); */
-/*     assert(x->size == A->n_cols); */
-/*     assert(b->size == A->n_rows); */
-	
-/*     int info; */
-	
-/*     size_t lda = max(1,A->n_rows); */
-/*     size_t ldb = max(lda,A->n_cols); */
-	
-/*     /\* Make explicit copy of matrix A data, as it will be overwritten */
-/*      * by lapack_gels *\/ */
-/*     c* A_data = Pool_allocatec(&lsq_solve_pool,A->n_rows*A->n_cols); */
-/*     c_copy(A_data,A->data,A->n_cols*A->n_rows); */
-
-/*     c* work_data = Pool_allocatec(&lsq_solve_pool,b->size); */
-/*     c_copy(work_data,b->data,b->size); */
-	
-/*     /\* Lapack documentation says: *\/ */
-/*     /\* 	if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */
-/*         squares solution vectors; the residual sum of squares for the */
-/*         solution in each column is given by the sum of squares of the */
-/*         modulus of elements N+1 to M in that column; */
-/*     *\/ */
-
-	
-/*     /\* We always assume one RHS column *\/ */
-/*     const int nrhs = 1; */
-
-/*     /\* General Least Squares Solve *\/ */
-/*     info = lapack_gels(LAPACK_COL_MAJOR, /\* Column-major ordering *\/ */
-/*                        'N', */
-/*                        A->n_rows,	/\* Number of rows in matrix *\/ */
-/*                        A->n_cols,	/\* Number of columns *\/ */
-/*                        nrhs,   /\* nrhs, which is number_mics *\/ */
-/*                        A_data, /\* The A-matrix *\/ */
-/*                        lda,	  /\* lda: the leading dimension of matrix A *\/ */
-/*                        work_data,	  /\* The b-matrix *\/ */
-/*                        ldb);  /\* ldb: the leading dimension of b: max(1,M,N) *\/ */
-		
-/*     if(info==0){ */
-/*         c_copy(x->data,work_data,x->size); */
-/*         rv = SUCCESS; */
-/*     } */
-/*     else { */
-/*         memset(x->data,0,x->size); */
-/*         WARN("LAPACK INFO VALUE"); */
-/*         printf("%i\n", info ); */
-/*         TRACE(15,"Solving least squares problem failed\n"); */
-
-/*         rv = FAILURE; */
-/*     } */
-
-/*     Pool_free(&lsq_solve_pool,A_data); */
-/*     Pool_free(&lsq_solve_pool,work_data); */
-/*     POOL_EXIT(lsq_solve_pool,15);     */
-/*     return rv; */
-    
-/* } */
-
-/* d c_normdiff(const cmat* A,const cmat* B) { */
-
-/*     TRACE(15,"c_normdif"); */
-	
-/*     dbgassert(A->n_cols==B->n_cols,"Number of columns of A and B " */
-/*               "should be equal"); */
-/*     dbgassert(A->n_rows==B->n_rows,"Number of rows of A and B " */
-/*               "should be equal"); */
-
-/*     size_t size = A->n_cols*A->n_rows; */
-
-/*     vc diff_temp = vc_al[MAX_MATRIX_SIZE]; */
-
-/*     c_copy(diff_temp,A->data,size); */
-
-/*     c alpha = -1.0; */
-
-/*     /\* This routine computes y <- alpha*x + beta*y *\/ */
-	
-	
-/*     /\* void cblas_zaxpy(OPENBLAS_CONST blasint n,  *\/ */
-/*     /\* 				 OPENBLAS_CONST double *alpha, *\/ */
-/*     /\* 				 OPENBLAS_CONST double *x, *\/ */
-/*     /\* 				 OPENBLAS_CONST blasint incx, *\/ */
-/*     /\* 				 double *y, *\/ */
-/*     /\* 				 OPENBLAS_CONST blasint incy); *\/ */
-
-/*     cblas_zaxpy(size, */
-/*                 (d*) &alpha, */
-/*                 (d*) B->data, */
-/*                 1, */
-/*                 (d*) diff_temp, */
-/*                 1 ); */
-	
-/*     return c_norm(diff_temp,size); */
-/* } */
-
 //////////////////////////////////////////////////////////////////////
-
--- a/beamforming/c/ascee_math.h
+++ b/beamforming/c/ascee_math.h
--- a/beamforming/c/ascee_math_raw.c
+++ b/beamforming/c/ascee_math_raw.c
@ -0,0 +1,146 @@
+// si_math.c
+//
+// last-edit-by: J.A. de Jong 
+// 
+// Description:
+// Operations working on raw arrays of floating point numbers
+//////////////////////////////////////////////////////////////////////
+#include "ascee_math_raw.h"
+#if ASCEE_USE_BLAS
+#include <cblas.h>
+#endif
+
+void d_elem_prod_d(d res[],
+                   const d arr1[],
+                   const d arr2[],
+                   const us size) {
+
+    #if ASCEE_USE_BLAS
+
+    #if ASCEE_DEBUG
+
+    if(arr1 == arr2) {
+        DBGWARN("d_elem_prod_d: Array 1 and array 2 point to the same"
+                " memory. This results in pointer aliasing, for which"
+                " testing is still to be done. Results might be"
+                " unrealiable.");
+    }
+
+    #endif
+
+
+    #if ASCEE_DOUBLE_PRECISION
+    #define elem_prod_fun cblas_dsbmv
+    #else
+    #define elem_prod_fun cblas_ssbmv
+    #endif
+    /* These parameters do not matter for this specific case */
+    const CBLAS_ORDER  mat_order= CblasColMajor;
+    const CBLAS_UPLO   uplo = CblasLower;
+
+    /* Extra multiplication factor */
+    const d alpha = 1.0;
+
+    /* void cblas_dsbmv(OPENBLAS_CONST enum CBLAS_ORDER order, */
+    /*                  OPENBLAS_CONST enum CBLAS_UPLO Uplo, */
+    /*                  OPENBLAS_CONST blasint N, */
+    /*                  OPENBLAS_CONST blasint K, */
+    /*                  OPENBLAS_CONST double alpha, */
+    /*                  OPENBLAS_CONST double *A, */
+    /*                  OPENBLAS_CONST blasint lda, */
+    /*                  OPENBLAS_CONST double *X, */
+    /*                  OPENBLAS_CONST blasint incX, */
+    /*                  OPENBLAS_CONST double beta, */
+    /*                  double *Y, */
+    /*                  OPENBLAS_CONST blasint incY); */
+
+    elem_prod_fun(mat_order,
+                  uplo,
+                  (blasint) size,
+                  0,             // Just the diagonal; 0 super-diagonal bands
+                  alpha,        /* Multiplication factor alpha */
+                  arr1,
+                  1,            /* LDA */
+                  arr2,         /* x */
+                  1, /* incX = 1 */
+                  0.0,          /* Beta */
+                  res,    /* The Y matrix to write to */
+                  1); /* incY */
+    #undef elem_prod_fun
+
+    #else  /* No blas routines, routine is very simple, but here we
+            * go! */
+    DBGWARN("Performing slow non-blas vector-vector multiplication");
+    for(us i=0;i<size;i++) {
+        res[i] = arr1[i]*arr2[i];
+    }
+    #endif
+}
+
+void c_elem_prod_c(c res[],
+                   const c arr1[],
+                   const c arr2[],
+                   const us size) {
+
+    TRACE(15,"c_elem_prod_c");
+    uVARTRACE(15,size);
+    
+    #if ASCEE_USE_BLAS
+
+    #if ASCEE_DEBUG
+
+    if(arr1 == arr2) {
+        DBGWARN("c_elem_prod_c: Array 1 and array 2 point to the same"
+                " memory. This results in pointer aliasing, for which"
+                " testing is still to be done. Results might be"
+                " unrealiable.");
+    }
+
+    #endif  /* ASCEE_DEBUG */
+
+
+    #if ASCEE_DOUBLE_PRECISION
+    #define elem_prod_fun cblas_zgbmv
+    #else
+    #define elem_prod_fun cblas_cgbmv
+    #endif
+
+    /* These parameters do not matter for this specific case */
+    const CBLAS_ORDER  mat_order= CblasColMajor;
+    const CBLAS_TRANSPOSE tr = CblasNoTrans;
+
+    const c alpha = 1.0;
+    const c beta = 0.0;
+    TRACE(15,"Calling " annestr(elem_prod_fun));
+    
+    elem_prod_fun(mat_order,
+                  tr,
+                  (blasint) size, /* M: Number of rows */
+                  (blasint) size, /* B: Number of columns */
+                  0,              /* KL: Number of sub-diagonals */
+                  0,              /* KU: Number of super-diagonals */
+                  (d*) &alpha,        /* Multiplication factor */
+                  (d*) arr2,          /* A */
+                  1,            /* LDA */
+                  (d*) arr1,    /* x */
+                  1, /* incX = 1 */
+                  (d*) &beta,
+                  (d*) res,    /* The Y matrix to write to */
+                  1); /* incY */
+
+    #undef elem_prod_fun
+
+    #else  /* No blas routines, routine is very simple, but here we
+            * go! */
+    DBGWARN("Performing slow non-blas vector-vector multiplication");
+    for(us i=0;i<size;i++) {
+        res[i] = arr1[i]*arr2[i];
+    }
+    #endif
+}
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+
--- a/beamforming/c/ascee_math_raw.h
+++ b/beamforming/c/ascee_math_raw.h
@ -0,0 +1,469 @@
+// ascee_math_raw.h
+//
+// Author: J.A. de Jong - ASCEE
+//
+// Description: Raw math routines working on raw arrays of floats and
+// complex numbers.
+//////////////////////////////////////////////////////////////////////
+#pragma once
+#ifndef ASCEE_MATH_RAW_H
+#define ASCEE_MATH_RAW_H
+#include "ascee_assert.h"
+#include "ascee_tracer.h"
+#include "types.h"
+#include <math.h>
+
+#if ASCEE_USE_BLAS == 1
+#include <cblas.h>
+#endif
+
+#ifdef ASCEE_DOUBLE_PRECISION
+#define c_real creal
+#define c_imag cimag
+#define d_abs fabs
+#define c_abs cabs
+#define c_conj conj
+#define d_atan2 atan2
+#define d_acos acos
+#define d_sqrt sqrt
+#define c_exp cexp
+#define d_sin sin
+#define d_cos cos
+#define d_pow pow
+
+#else  // ASCEE_DOUBLE_PRECISION not defined
+#define c_conj conjf
+#define c_real crealf
+#define c_imag cimagf
+#define d_abs fabsf
+#define c_abs cabsf
+#define d_atan2 atan2f
+#define d_acos acosf
+#define d_sqrt sqrtf
+#define c_exp cexpf
+#define d_sin sinf
+#define d_cos cosf
+#define d_pow powf
+
+#endif // ASCEE_DOUBLE_PRECISION
+
+#ifdef M_PI
+static const d number_pi = M_PI;
+#else
+static const d number_pi = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679;
+#endif
+
+/** 
+ * Set all elements in an array equal to val
+ *
+ * @param to 
+ * @param val 
+ * @param size 
+ */
+static inline void d_set(d to[],d val,us size) {
+    for(us i=0;i<size;i++) {
+        to[i]=val;
+    }
+}
+/** 
+ * Set all elements in an array equal to val
+ *
+ * @param to 
+ * @param val 
+ * @param size 
+ */
+static inline void c_set(c to[],c val,us size) {
+    for(us i=0;i<size;i++) {
+        to[i]=val;
+    }
+}
+
+/** 
+ * Return the maximum of two doubles
+ *
+ * @param a value 1
+ * @param b value 2
+ * 
+ * @returns the maximum of value 1 and 2
+ */
+static inline d d_max(const d a,const d b) {
+    return a>b?a:b;
+}
+
+
+/** 
+ * Return the dot product of two arrays, one of them complex-valued,
+ * the other real-valued
+ *
+ * @param a the complex-valued array
+ * @param b the real-valued array
+ * @param size the size of the arrays. *Should be equal-sized!*
+ *
+ * @return the dot product
+ */
+static inline c cd_dot(const c a[],const d b[],us size){
+    c result = 0;
+    us i;
+    for(i=0;i<size;i++){
+        result+=a[i]*b[i];
+    }
+    return result;
+}
+/** 
+ * Return the dot product of two complex-valued arrays. Wraps BLAS
+ * when ASCEE_USE_BLAS == 1.
+ *
+ * @param a complex-valued array
+ * @param b complex-valued array
+ * @param size the size of the arrays. *Should be equal-sized!*
+ *
+ * @return the dot product
+ */
+static inline c cc_dot(const c a[],const c b[],us size){
+    #if ASCEE_USE_BLAS == 1
+    WARN("CBlas zdotu not yet tested");
+    #if ASCEE_DOUBLE_PRECISION
+    // assert(0);
+    return cblas_zdotu(size,(d*) a,1,(d*) b,1);
+    #else
+    return cblas_cdotu(size,(d*) a,1,(d*) b,1);
+    #endif
+    #else
+    c result = 0;
+    us i;
+    for(i=0;i<size;i++){
+        result+=a[i]*b[i];
+    }
+    return result;
+    #endif
+}
+
+/** 
+ * Compute the dot product of two real arrays.
+ *
+ * @param a First array.
+ * @param b Second array.
+ * @param size Size of the arrays.
+ * @return The result.
+ */
+static inline d d_dot(const d a[],const d b[],const us size){
+    #if ASCEE_USE_BLAS == 1
+    #if ASCEE_DOUBLE_PRECISION
+    return cblas_ddot(size,a,1,b,1);
+    #else  // Single precision function
+    return cblas_sdot(size,a,1,b,1);
+    #endif
+    #else  // No BLAS, do it manually
+    d result = 0;
+    us i;
+    for(i=0;i<size;i++){
+        result+=a[i]*b[i];
+    }
+    return result;
+    #endif
+}
+
+
+/** 
+ * Copy array of floats.
+ *
+ * @param to : Array to write to
+ * @param from : Array to read from
+ * @param size : Size of arrays
+ */
+static inline void d_copy(d to[],const d from[],const us size){
+    #if ASCEE_USE_BLAS == 1
+    cblas_dcopy(size,from,1,to,1);
+    #else
+    us i;
+    for(i=0;i<size;i++)
+        to[i] = from[i];
+    #endif
+}
+
+
+/** 
+ * Copy array of floats to array of complex floats. Imaginary part set
+ * to zero.
+ *
+ * @param to : Array to write to
+ * @param from : Array to read from
+ * @param size : Size of arrays
+ */
+static inline void cd_copy(c to[],const d from[],const us size) {
+    us i;
+    for(i=0;i<size;i++) {
+        to[i] = (c) (from[i]);
+        dbgassert(cimag(to[i]) == 0,"Imaginary part not equal to zero");
+    }
+}
+
+/** 
+ * Copy float vector to complex vector. Imaginary part set
+ * to zero.
+ *
+ * @param to : Vector to write to
+ * @param from : Vector to read from
+ */
+static inline void c_copy(c to[],const c from[],us size){
+    
+    #if ASCEE_USE_BLAS == 1
+    #if ASCEE_DOUBLE_PRECISION
+    cblas_zcopy(size,(d*) from,1,(d*) to,1);
+    #else
+    cblas_ccopy(size,(d*) from,1,(d*) to,1);
+    #endif
+    #else
+    us i;
+    for(i=0;i<size;i++)
+        to[i] = from[i];
+    #endif
+}
+/** 
+ * Multiply y with fac, and add result to x
+ *
+ * @param x Array to add to
+ * @param y Array to add to x
+ * @param fac Factor with which to multiply y
+ * @param size Size of the arrays
+ */
+static inline void d_add_to(d x[],const d y[],d fac,us size){
+    #if ASCEE_USE_BLAS == 1
+    #if ASCEE_DOUBLE_PRECISION
+    cblas_daxpy(size,fac,y,1,x,1);
+    #else
+    cblas_saxpy(size,fac,y,1,x,1);
+    #endif
+    #else
+    us i;
+    for(i=0;i<size;i++)
+        x[i]+=fac*y[i];
+    #endif
+}
+/** 
+ * x = x + fac*y
+ *
+ * @param x Array to add to
+ * @param y Array to add to x
+ * @param fac Factor with which to multiply y
+ * @param size Size of the arrays
+ */
+static inline void c_add_to(c x[],const c y[],c fac,us size){
+    #if ASCEE_USE_BLAS == 1
+    #if ASCEE_DOUBLE_PRECISION
+    cblas_zaxpy(size,(d*) &fac,(d*) y,1,(d*) x,1);
+    #else
+    cblas_caxpy(size,(d*) &fac,(d*) y,1,(d*) x,1);
+    #endif
+    #else
+    us i;
+    for(i=0;i<size;i++)
+        x[i]+=fac*y[i];
+    #endif
+}
+
+/** 
+ * Scale an array of doubles
+ *
+ * @param a array
+ * @param scale_fac scale factor
+ * @param size size of the array
+ */
+static inline void d_scale(d a[],const d scale_fac,us size){
+    #if ASCEE_USE_BLAS == 1
+    #if ASCEE_DOUBLE_PRECISION    
+    cblas_dscal(size,scale_fac,a,1);
+    #else
+    cblas_sscal(size,scale_fac,a,1);
+    #endif
+    #else
+    us i;
+    for(i=0;i<size;i++)
+        a[i]*=scale_fac;
+    #endif
+}
+
+
+/** 
+ * Scale an array of complex floats
+ *
+ * @param a array
+ * @param scale_fac scale factor
+ * @param size size of the array
+ */
+static inline void c_scale(c a[],const c scale_fac,us size){
+    #if ASCEE_USE_BLAS == 1
+    // Complex argument should be given in as array of two double
+    // values. The first the real part, the second the imaginary
+    // part. Fortunately the (c) type stores the two values in this
+    // order. To be portable and absolutely sure anything goes well,
+    // we convert it explicitly here.
+    d scale_fac_d [] = {creal(scale_fac),cimag(scale_fac)};
+
+    #if ASCEE_DOUBLE_PRECISION
+    cblas_zscal(size,scale_fac_d,(d*) a,1);
+    #else
+    cblas_cscal(size,scale_fac_d,(d*) a,1);
+    #endif
+    #else
+    us i;
+    for(i=0;i<size;i++)
+        a[i]*=scale_fac;
+    #endif
+}
+
+
+/** 
+ * Compute the maximum value of an array
+ *
+ * @param a array
+ * @param size size of the array
+ * @return maximum
+ */
+static inline d darray_max(const d a[],us size){
+    us i;
+    d max = a[0];
+    for(i=1;i<size;i++){
+        if(a[i] > max) max=a[i];
+    }
+    return max;
+}
+/** 
+ * Compute the minimum of an array
+ *
+ * @param a array
+ * @param size size of the array
+ * @return minimum
+ */
+static inline d d_min(const d a[],us size){
+    us i;
+    d min = a[0];
+    for(i=1;i<size;i++){
+        if(a[i] > min) min=a[i];
+    }
+    return min;
+}
+
+/** 
+ * Compute the \f$ L_2 \f$ norm of an array of doubles
+ *
+ * @param a Array
+ * @param size Size of array
+ */
+static inline d d_norm(const d a[],us size){
+    #if ASCEE_USE_BLAS == 1
+    return cblas_dnrm2(size,a,1);
+    #else	
+    d norm = 0;
+    us i;
+    for(i=0;i<size;i++){
+        norm+=a[i]*a[i];
+    }
+    norm = d_sqrt(norm);
+    return norm;
+    #endif
+	
+}
+
+/** 
+ * Compute the \f$ L_2 \f$ norm of an array of complex floats
+ *
+ * @param a Array
+ * @param size Size of array
+ */
+static inline d c_norm(const c a[],us size){
+    #if ASCEE_USE_BLAS == 1
+    return cblas_dznrm2(size,(d*) a,1);
+    #else	
+    d norm = 0;
+    us i;
+    for(i=0;i<size;i++){
+        d absa = c_abs(a[i]);
+        norm+=absa*absa;
+    }
+    norm = d_sqrt(norm);
+    return norm;
+    #endif
+	
+}
+
+
+/** 
+ * Computes the element-wise vector product, or Hadamard product of
+ * arr1 and arr2
+ *
+ * @param res Where the result will be stored
+ * @param arr1 Array 1
+ * @param vec2 Array 2
+ * @param size: Size of the arrays
+ */
+void d_elem_prod_d(d res[],
+                   const d arr1[],
+                   const d arr2[],
+                   const us size);
+
+/** 
+ * Computes the element-wise vector product, or Hadamard product of
+ * arr1 and arr2 for complex floats
+ *
+ * @param res Where the result will be stored
+ * @param arr1 Array 1
+ * @param vec2 Array 2
+ * @param size: Size of the arrays
+ */
+void c_elem_prod_c(c res[],
+                   const c arr1[],
+                   const c arr2[],
+                   const us size);
+
+/** 
+ * Compute the complex conjugate of a complex vector and store the
+ * result.
+ *
+ * @param res Result vector
+ * @param in Input vector
+ * @param size Size of the vector
+ */
+static inline void c_conj_c(c res[],const c in[],const us size) {
+    // First set the result vector to zero
+    c_set(res,0,size);
+    #ifdef ASCEE_USE_BLAS
+    #if ASCEE_DOUBLE_PRECISION
+    // Cast as a float, scale all odd elements with minus one to find
+    // the complex conjugate.
+    cblas_daxpy(size ,1,(d*) in,2,(d*) res,2);
+    cblas_daxpy(size,-1,&((d*) in)[1],2,&((d*) res)[1],2);
+    #else
+    cblas_faxpy(size ,1,(d*) in,2,(d*) res,2);
+    cblas_faxpy(size,-1,&((d*) in)[1],2,&((d*) res)[1],2);
+    #endif  // ASCEE_DOUBLE_PRECISION
+    #else
+    for(us i=0;i<size;i++) {
+        res[i] = c_conj(in[i]);
+    }
+    #endif  // ASCEE_USE_BLAS
+}
+/** 
+ * In place complex conjugation
+ *
+ * @param res Result vector
+ * @param size Size of the vector
+ */
+static inline void c_conj_inplace(c res[],us size) {
+    #ifdef ASCEE_USE_BLAS
+    #if ASCEE_DOUBLE_PRECISION
+    // Cast as a float, scale all odd elements with minus one to find
+    // the complex conjugate.
+    cblas_dscal(size,-1,&((d*) res)[1],2);
+    #else
+    cblas_sscal(size,-1,&((d*) res)[1],2);
+    #endif  // ASCEE_DOUBLE_PRECISION
+    #else
+    for(us i=0;i<size;i++) {
+        res[i] = c_conj(res[i]);
+    }
+    #endif  // ASCEE_USE_BLAS
+}
+
+#endif // ASCEE_MATH_RAW_H
+//////////////////////////////////////////////////////////////////////
--- a/beamforming/c/ascee_tracer.c
+++ b/beamforming/c/ascee_tracer.c
@ -7,7 +7,7 @@
 //////////////////////////////////////////////////////////////////////
 #if TRACER == 1
 #include <stdio.h>
-#include "tracer.h"
+#include "ascee_tracer.h"
 #include "types.h"

 #ifdef _REENTRANT
--- a/beamforming/c/ascee_tracer.h
+++ b/beamforming/c/ascee_tracer.h
@ -1,4 +1,4 @@
-// tracer.h
+// ascee_tracer.h
 //
 // Author: J.A. de Jong - ASCEE
 //
@ -6,8 +6,8 @@
 // Basic tracing code for debugging.
 //////////////////////////////////////////////////////////////////////
 #pragma once
-#ifndef TRACER_H
-#define TRACER_H
+#ifndef ASCEE_TRACER_H
+#define ASCEE_TRACER_H

 // Some console colors
 #define RESET   "\033[0m"
@ -214,6 +214,5 @@ void uvartrace_impl(const char* pos,int line,const char* varname,size_t var);
 #endif	// ######################################## TRACER ==1


-
-#endif // TRACER_H
+#endif // ASCEE_TRACER_H
 //////////////////////////////////////////////////////////////////////
--- a/beamforming/c/fft.c
+++ b/beamforming/c/fft.c
@ -6,30 +6,26 @@
 // 
 //////////////////////////////////////////////////////////////////////
 #define TRACERPLUS (-5)
-#include "tracer.h"
+#include "ascee_tracer.h"
 #include "fft.h"
 #include "types.h"
-/* #include "kiss_fftr.h" */
 #include "fftpack.h"

 typedef struct Fft_s {
    us nfft;
    us nchannels;
-
    Fftr* fftr;
-
 } Fft;

 Fft* Fft_alloc(const us nfft,const us nchannels) {
    fsTRACE(15);

-    #ifdef ASCEE_DEBUG
    if(nchannels > ASCEE_MAX_NUM_CHANNELS) {
        WARN("Too high number of channels! Please increase the "
             "ASCEE_MAX_NUM_CHANNELS compilation flag");
        return NULL;
    }
-    
+
    Fft* fft = a_malloc(sizeof(Fft));
    if(fft==NULL) {
        WARN("Fft allocation failed");
@ -41,10 +37,10 @@ Fft* Fft_alloc(const us nfft,const us nchannels) {

    fft->fftr = Fftr_alloc(nfft);
    if(!fft->fftr) {
+        a_free(fft);
        WARN(ALLOCFAILED "fftr");
        return NULL;
    }
-    #endif // ASCEE_PARALLEL

    feTRACE(15); 
    return fft;
@ -57,10 +53,9 @@ void Fft_free(Fft* fft) {
 }
 us Fft_nchannels(const Fft* fft) {return fft->nchannels;}
 us Fft_nfft(const Fft* fft) {return fft->nfft;}
-
 void Fft_fft(const Fft* fft,const dmat* timedata,cmat* result) {
    fsTRACE(15);
-
+    dbgassert(timedata && result,NULLPTRDEREF);
    dbgassert(timedata->n_rows == fft->nfft,"Invalid size for time data rows."
        " Should be equal to nfft");
    dbgassert(timedata->n_cols == fft->nchannels,"Invalid size for time data cols."
--- a/beamforming/c/mq.c
+++ b/beamforming/c/mq.c
@ -8,7 +8,7 @@
 //////////////////////////////////////////////////////////////////////
 #define TRACERPLUS (-6)
 #include "types.h"
-#include "tracer.h"
+#include "ascee_tracer.h"
 #include "ascee_assert.h"
 #include "ascee_alloc.h"
 #include "mq.h"
@ -257,8 +257,6 @@ void* JobQueue_assign(JobQueue* jq) {
    pid_t tid = syscall(SYS_gettid);    
    iVARTRACE(16,tid);    
    #endif // __linux__
-    
-    
    #endif //  ASCEE_DEBUG


--- a/beamforming/c/ps.c
+++ b/beamforming/c/ps.c
@ -5,11 +5,11 @@
 // Description:
 //
 //////////////////////////////////////////////////////////////////////
-#define TRACERPLUS 1000
+/* #define TRACERPLUS 1000 */
 #include "ps.h"
 #include "fft.h"
 #include "ascee_alloc.h"
-#include "ascee_math.h"
+#include "ascee_alg.h"
 #include "ascee_assert.h"

 typedef struct PowerSpectra_s {
@ -27,7 +27,7 @@ PowerSpectra* PowerSpectra_alloc(const us nfft,
                                 const us nchannels,
                                 const WindowType wt) {
    
-    TRACE(15,"PowerSpectra_alloc");
+    fsTRACE(15);
    int rv;

    /* Check nfft */
@ -61,11 +61,12 @@ PowerSpectra* PowerSpectra_alloc(const us nfft,
    if(rv!=0) {
        WARN("Error creating window function, continuing anyway");
    }
+    feTRACE(15);
    return ps;
 }

 void PowerSpectra_free(PowerSpectra* ps) {
-    TRACE(15,"PowerSpectra_free");
+    fsTRACE(15);

    Fft_free(ps->fft);
    vd_free(&ps->window);
@ -73,15 +74,19 @@ void PowerSpectra_free(PowerSpectra* ps) {
    dmat_free(&ps->timedata_work);
    vc_free(&ps->j_vec_conj);
    a_free(ps);
+
+    feTRACE(15);
 }


-int PowerSpectra_compute(const PowerSpectra* ps,
+void PowerSpectra_compute(const PowerSpectra* ps,
                         const dmat * timedata,
                         cmat * result) {
    
-    TRACE(15,"PowerSpectra_compute");
+    fsTRACE(15);

+    dbgassert(ps && timedata && result,NULLPTRDEREF);
+    
    const us nchannels = Fft_nchannels(ps->fft);
    const us nfft = Fft_nfft(ps->fft);
    uVARTRACE(15,nchannels);
@ -125,11 +130,13 @@ int PowerSpectra_compute(const PowerSpectra* ps,
            &timedata_work,
            &fft_work);

+    TRACE(15,"fft done");
    
    /* Scale fft such that power is easily comxputed */
    const c scale_fac = d_sqrt(2/win_pow)/nfft;
-    c_scale(fft_work.data,scale_fac,(nfft/2+1)*nchannels);
-
+    cmat_scale(&fft_work,scale_fac);
+    TRACE(15,"scale done");
+    
    for(i=0;i< nchannels;i++) {
        /* Multiply DC term with 1/sqrt(2) */
        *getcmatval(&fft_work,0,i) *= 1/d_sqrt(2.)+0*I;
@ -139,166 +146,34 @@ int PowerSpectra_compute(const PowerSpectra* ps,
    }

    /* print_cmat(&fft_work); */
+    TRACE(15,"Nyquist and DC correction done");

-    c* j_vec_conj = ps->j_vec_conj.data;
+    vc j_vec_conj = ps->j_vec_conj;

    /* Compute Cross-power spectra and store result */
    for(i =0; i<nchannels;i++) {
        for(j=0;j<nchannels;j++) {
+            iVARTRACE(15,i);
+            iVARTRACE(15,j);
            /* The indices here are important. This is also how it
             * is documented */
-            c* res = getcmatval(result,0,i+j*nchannels);
-
-            c* i_vec = getcmatval(&fft_work,0,i);
-            c* j_vec = getcmatval(&fft_work,0,j);
-
+            vc res = cmat_column(result,i+j*nchannels);
+            TRACE(15,"SFSG");
+            vc i_vec = cmat_column(&fft_work,i);
+            vc j_vec = cmat_column(&fft_work,j);
+            TRACE(15,"SFSG");
            /* Compute the conjugate of spectra j */
-            c_conj_c(j_vec_conj,j_vec,nfft/2+1);
-
-            /* Compute the product of the two vectors and store the
-             * result as the result */
-            c_elem_prod_c(res,i_vec,j_vec_conj,nfft/2+1);
-
+            vc_conj_vc(&j_vec_conj,&j_vec);
+            TRACE(15,"SFSG");
+            /* Compute the element-wise product of the two vectors and
+             * store the result as the result */
+            vc_elem_prod(&res,&i_vec,&j_vec_conj);
+            TRACE(15,"SFSG");
        }
    }
-
-    return SUCCESS;
+    feTRACE(15);
 }



-
-/* typedef struct AvPowerSpectra_s { */
-
-
-/*     us overlap_offset; */
-/*     us naverages;               /\* The number of averages taken *\/ */
-    
-/*     dmat prev_timedata;         /\**< Storage for previous */
-/*                                  * timedata buffer *\/ */
-
-/*     vc* ps;           /\**< Here we store the averaged */
-/*                        * results for each Cross-power */
-/*                        * spectra. These are */
-/*                        * nchannels*nchannels vectors *\/ */
-
-/*     vc* ps_work;         /\**< Work area for the results, also */
-/*                           * nchannels*nchannels *\/ */
-
-/* } AvPowerSpectra; */
-
-/* AvPowerSpectra* AvPowerSpectra_alloc(const us nfft, */
-/*                                      const us nchannels, */
-/*                                      const d overlap_percentage) { */
-    
-/*     TRACE(15,"AvPowerSpectra_alloc"); */
-/*     int rv; */
-
-/*     /\* Check nfft *\/ */
-/*     if(nfft % 2 != 0) { */
-/*         WARN("nfft should be even"); */
-/*         return NULL; */
-/*     } */
-
-/*     /\* Check overlap percentage *\/ */
-/*     us overlap_offset = nfft - (us) overlap_percentage*nfft/100; */
-
-/*     if(overlap_offset == 0 || overlap_offset > nfft) { */
-
-/*         WARN("Overlap percentage results in offset of 0, or a too high number of */
-/* overlap" */
-/*              " samples. Number of overlap samples should be < nfft"); */
-
-/*         WARN("Illegal overlap percentage. Should be 0 <= %% < 100"); */
-/*         return NULL; */
-/*     } */
-
-
-/*     /\* ALlocate space *\/ */
-/*     Fft fft; */
-/*     rv = Fft_alloc(&fft,nfft,nchannels); */
-/*     if(rv != SUCCESS) { */
-/*         WARN("Fft allocation failed"); */
-/*         return NULL; */
-/*     } */
-
-/*     AvPowerSpectra* aps = a_malloc(sizeof(AvPowerSpectra)); */
-/*     if(!aps) { */
-/*         WARN("Allocation of AvPowerSpectra memory failed"); */
-/*         return NULL; */
-/*     } */
-/*     ps->naverages = 0; */
-/*     ps->overlap_offset = overlap_offset; */
-
-/*     /\* Allocate vectors and matrices *\/ */
-/*     ps->prev_timedata = dmat_alloc(nfft,nchannels); */
-
-/*     return ps; */
-/* } */
-/* us AvPowerSpectra_getAverages(const AvPowerSpectra* ps) { */
-/*     return ps->naverages; */
-/* } */
-
-/* /\**  */
-/*  * Compute single power spectra by  */
-/*  * */
-/*  * @param ps Initialized AvPowerSpectra structure */
-/*  * @param timedata Timedata to compute for */
-/*  * @param result Result */
-/*  * */
-/*  * @return  */
-/*  *\/ */
-/* static int AvPowerSpectra_computeSingle(const AvPowerSpectra* ps, */
-/*                                       const dmat* timedata, */
-/*                                       vc* results) { */
-
-/*     us nchannels = ps->fft.nchannels; */
-/*     for(us channel=0;channel<ps;channel++) { */
-
-/*     } */
-/*     return SUCCESS; */
-/* } */
-
-
-/* int AvPowerSpectra_addTimeData(AvPowerSpectra* ps, */
-/*                              const dmat* timedata) { */
-    
-/*     TRACE(15,"AvPowerSpectra_addTimeData"); */
-/*     dbgassert(ps,"Null pointer given for ps"); */
-    
-/*     const us nchannels = ps->fft.nchannels; */
-/*     const us nfft = ps->fft.nfft; */
-
-/*     dbgassert(timedata->n_cols == nchannels,"Invalid time data"); */
-/*     dbgassert(timedata->n_rows == nfft,"Invalid time data"); */
-
-/*     dmat prevt = ps->prev_timedata; */
-/*     us noverlap = ps->noverlap; */
-
-/*     if(ps->naverages != 0) { */
-/*         /\* Copy the overlap buffer to the tbuf *\/ */
-/*         copy_dmat_rows(&tbuf,&overlap,0,0,noverlap); */
-
-/*         /\* Copy the new timedata buffer to the tbuf *\/ */
-/*         copy_dmat_rows(&tbuf,timedata,0,noverlap,(nfft-noverlap)); */
-/*     } */
-/*     else { */
-/*         /\* Copy the overlap buffer to the tbuf *\/ */
-/*         copy_dmat_rows(&tbuf,&overlap,0,0,noverlap); */
-
-
-/*     } */
-
-/*     return SUCCESS; */
-/* } */
-/* void AvPowerSpectra_free(AvPowerSpectra* ps) { */
-/*     TRACE(15,"AvPowerSpectra_free"); */
-
-/*     Fft_free(&ps->fft); */
-/*     dmat_free(&ps->prev_timedata); */
-/*     vd_free(&ps->window);     */
-
-/*     a_free(ps); */
-/* } */
-
 //////////////////////////////////////////////////////////////////////
--- a/beamforming/c/ps.h
+++ b/beamforming/c/ps.h
@ -9,59 +9,54 @@
 #ifndef PS_H
 #define PS_H
 #include "window.h"
-
-struct PowerSpectra_s;
-struct AvPowerSpectra_s;
-
 typedef struct PowerSpectra_s PowerSpectra;
-typedef struct AvPowerSpectra_s AvPowerSpectra;


+/** 
+ * Allocate a PowerSpectra computer
+ *
+ * @param nfft fft length
+ * @param nchannels Number of channels
+ * @param wt Windowtype, as defined in window.h
+ *
+ * @return PowerSpectra handle, NULL on error
+ */
 PowerSpectra* PowerSpectra_alloc(const us nfft,
                                 const us nchannels,
                                 const WindowType wt);

 /** 
- * Compute power spectra, returns to a complex array
+ * Compute power spectra.
+ *
+ * @param ps[in] PowerSpectra handle
+ * @param[in] timedata Time data. Should have size nfft*nchannels
+ *
+ * @param[out] result Here, the result will be stored. Should have
+ * size (nfft/2+1)*nchannels^2, such that the Cij at frequency index f
+ * can be obtained as result[f,i+j*nchannels]
 *
- * @param[in] timedata Time data, should be of size nfft*nchannels
- * @param [out] Cross-power spectra, array should be of size
- * (nfft/2+1) x (nchannels*nchannels), such that the cross spectra of
- * channel i with channel j at can be found as
- * getvcval(0,i+j*nchannels).
- * @return status code, SUCCESS on succes.
 */
-int PowerSpectra_compute(const PowerSpectra*,
+void PowerSpectra_compute(const PowerSpectra* ps,
                         const dmat* timedata,
                         cmat *result);

 /** 
- * Free storage of the PowerSpectra
- */
-void PowerSpectra_free(PowerSpectra*);
-
-
-// AvPowerSpectra* AvPowerSpectra_alloc(const us nfft,
-//                                      const us nchannels,
-//                                      const d overlap_percentage,
-//                                      const WindowType wt);
-                                 
-
-/** 
- * Return the current number of averages taken to obtain the result.
+ * Return nfft 
 *
- * @return The number of averages taken.
+ * @param ps[in] PowerSpectra handle
+ *
+ * @return nfft
 */
-us AvPowerSpectra_getAverages(const AvPowerSpectra*);
-
-
-int AvPowerSpectra_addTimeData(AvPowerSpectra*,
-                             const dmat* timedata);
+us PowerSpectra_getnfft(const PowerSpectra* ps);

 /** 
- * Free storage of the AvPowerSpectra
+ * Free PowerSpectra
+ *
+ * @param[in] ps PowerSpectra handle
 */
-void AvPowerSpectra_free(AvPowerSpectra*);
+void PowerSpectra_free(PowerSpectra* ps);
+
+

 #endif // PS_H
 //////////////////////////////////////////////////////////////////////
--- a/beamforming/c/types.h
+++ b/beamforming/c/types.h
@ -10,6 +10,15 @@
 #ifndef TYPES_H
 #define TYPES_H

+// Branch prediction performance improvement
+#if !defined(likely) && defined(__GNUC__) && !defined(ASCEE_DEBUG)
+#define likely(x)       __builtin_expect(!!(x), 1)
+#define unlikely(x)     __builtin_expect(!!(x), 0)
+#else
+#define likely(x)       (x)
+#define unlikely(x)     (x)
+#endif
+
 /// We often use boolean values
 #include <stdbool.h>            // true, false
 #include <stddef.h>
--- a/beamforming/c/worker.c
+++ b/beamforming/c/worker.c
@ -11,7 +11,7 @@
 #include "ascee_alloc.h"
 #include <pthread.h>
 #include "ascee_assert.h"
-#include "tracer.h"
+#include "ascee_tracer.h"

 typedef struct Workers_s {
    JobQueue* jq;
--- a/beamforming/config.pxi.in
+++ b/beamforming/config.pxi.in
@ -18,25 +18,26 @@ ELSE:

 ctypedef size_t us

-cdef extern from "math.h":
+cdef extern from "ascee_tracer.h":
+    void setTracerLevel(int)
+
+cdef extern from "ascee_math.h":
    ctypedef struct dmat:
        d* data
+        d** col_ptrs
        us n_cols,n_rows
    ctypedef struct cmat:
        c* data
+        c** col_ptrs
        us n_cols,n_rows

-# cdef np.ndarray empty(us nrows,us ncols,dtype):
-cdef dmat dmat_from_array(d[::1,:] arr):
-    cdef dmat result
-    result.data = &arr[0,0]
-    result.n_rows = arr.shape[0]
-    result.n_cols = arr.shape[1]
-    return result
+    dmat dmat_foreign(us n_rows,
+                      us n_cols,
+                      d* data)
+    cmat cmat_foreign(us n_rows,
+                      us n_cols,
+                      c* data)

-cdef cmat cmat_from_array(c[::1,:] arr):
-    cdef cmat result
-    result.data = &arr[0,0]
-    result.n_rows = arr.shape[0]
-    result.n_cols = arr.shape[1]
-    return result
+    void dmat_free(dmat*)
+    void cmat_free(cmat*)
+    void cmat_copy(cmat* to,cmat* from_)
--- a/beamforming/fft.pyx
+++ b/beamforming/fft.pyx
@ -1,5 +1,7 @@
 include "config.pxi"

+# setTracerLevel(0)
+
 cdef extern from "fft.h":
    ctypedef struct c_Fft "Fft"
    c_Fft* Fft_alloc(us nfft,us nchannels)
@ -32,11 +34,19 @@ cdef class Fft:
                           nchannels),
                           dtype=NUMPY_COMPLEX_TYPE,order='F')
        # result[:,:] = np.nan+1j*np.nan
+        cdef c[::1,:] result_view = result
+        cdef cmat r = cmat_foreign(result.shape[0],
+                                   result.shape[1],
+                                   &result_view[0,0])
+        cdef dmat t = dmat_foreign(timedata.shape[0],
+                                   timedata.shape[1],
+                                   &timedata[0,0])

-        cdef cmat r = cmat_from_array(result)
-        cdef dmat t = dmat_from_array(timedata)
        Fft_fft(self._fft,&t,&r)

+        dmat_free(&t)
+        cmat_free(&r)
+
        return result


@ -50,46 +60,139 @@ cdef extern from "window.h":
 cdef extern from "ps.h":
    ctypedef struct c_PowerSpectra "PowerSpectra"
    c_PowerSpectra* PowerSpectra_alloc(const us nfft,
-                                     const us nchannels,
-                                     const WindowType wt)
-    int PowerSpectra_compute(const c_PowerSpectra* ps,
+                                       const us nchannels,
+                                       const WindowType wt)
+    void PowerSpectra_compute(const c_PowerSpectra* ps,
                             const dmat * timedata,
                             cmat * result)
    

    void PowerSpectra_free(c_PowerSpectra*)

-# cdef class PowerSpectra:
-#     cdef:
-#         c_PowerSpectra* _ps
+cdef class PowerSpectra:
+    cdef:
+        c_PowerSpectra* _ps

-#     def __cinit__(self, us nfft,us nchannels):
-#         self._ps = PowerSpectra_alloc(nfft,nchannels,Rectangular)
-#         if self._ps == NULL:
-#             raise RuntimeError('PowerSpectra allocation failed')
+    def __cinit__(self, us nfft,us nchannels):
+        self._ps = PowerSpectra_alloc(nfft,nchannels,Rectangular)
+        if self._ps == NULL:
+            raise RuntimeError('PowerSpectra allocation failed')

-#     def compute(self,d[::1,:] timedata):
-#         cdef:
-#             us nchannels = timedata.shape[1]
-#             us nfft = timedata.shape[0]
-#             int rv
-#             dmat td
-#             cmat result_mat
-#         td.data = &timedata[0,0]
-#         td.n_rows = nfft
-#         td.n_cols = nchannels
-#         result = np.empty((nfft//2+1,nchannels*nchannels),
-#                           dtype = NUMPY_COMPLEX_TYPE,
-#                           order='F')
-#         result_mat = cmat_from_array(result)
+    def compute(self,d[::1,:] timedata):
+        cdef:
+            us nchannels = timedata.shape[1]
+            us nfft = timedata.shape[0]
+            int rv
+            dmat td
+            cmat result_mat
+        td = dmat_foreign(nfft,
+                          nchannels,
+                          &timedata[0,0])

-#         rv = PowerSpectra_compute(self._ps,&td,&result_mat)
-#         if rv !=0:
-#             raise RuntimeError("Error computing power spectra")
+        # The array here is created in such a way that the strides
+        # increase with increasing dimension. This is required for
+        # interoperability with the C-code, that stores all
+        # cross-spectra in a 2D matrix, where the first axis is the
+        # frequency axis, and the second axis corresponds to a certain
+        # cross-spectrum, as C_ij(f) = result[freq,i+j*nchannels]

-#         return result
+        result = np.empty((nfft//2+1,nchannels,nchannels),
+                          dtype = NUMPY_COMPLEX_TYPE,
+                          order='F')
+        cdef c[::1,:,:] result_view = result
+
+        result_mat = cmat_foreign(nfft//2+1,
+                                  nchannels*nchannels,
+                                  &result_view[0,0,0])
+
+        PowerSpectra_compute(self._ps,&td,&result_mat)
+
+        dmat_free(&td)
+        cmat_free(&result_mat)
+
+        return result


-#     def __dealloc__(self):
-#         if self._ps != NULL:
-#             PowerSpectra_free(self._ps)
+    def __dealloc__(self):
+        if self._ps != NULL:
+            PowerSpectra_free(self._ps)
+
+
+cdef extern from "aps.h":
+    ctypedef struct c_AvPowerSpectra "AvPowerSpectra"
+    c_AvPowerSpectra* AvPowerSpectra_alloc(const us nfft,
+                                           const us nchannels,
+                                           d overlap_percentage,
+                                           const WindowType wt)
+
+    cmat* AvPowerSpectra_addTimeData(const c_AvPowerSpectra* ps,
+                                     const dmat * timedata)
+    
+
+    void AvPowerSpectra_free(c_AvPowerSpectra*)
+
+            
+cdef class AvPowerSpectra:
+    cdef:
+        c_AvPowerSpectra* aps
+        us nfft, nchannels
+
+    def __cinit__(self,us nfft,us nchannels,d overlap_percentage):
+        self.aps = AvPowerSpectra_alloc(nfft,
+                                        nchannels,
+                                        overlap_percentage,
+                                        Rectangular)
+        self.nchannels = nchannels
+        self.nfft = nfft
+
+        if self.aps == NULL:
+            raise RuntimeError('AvPowerSpectra allocation failed')
+
+    def __dealloc__(self):
+        if self.aps:
+            AvPowerSpectra_free(self.aps)
+        
+    def addTimeData(self,d[::1,:] timedata):
+        """!
+        Adds time data, returns current result
+        """
+        cdef:
+            us nsamples = timedata.shape[0]
+            us nchannels = timedata.shape[1]
+            dmat td
+            cmat* result_ptr
+
+        if nsamples < self.nfft:
+            raise RuntimeError('Number of samples should be > nfft')
+        if nchannels != self.nchannels:
+            raise RuntimeError('Invalid number of channels')            
+
+        td = dmat_foreign(nsamples,
+                          nchannels,
+                          &timedata[0,0])
+
+        result_ptr = AvPowerSpectra_addTimeData(self.aps,
+                                                &td)
+
+        # The array here is created in such a way that the strides
+        # increase with increasing dimension. This is required for
+        # interoperability with the C-code, that stores all
+        # cross-spectra in a 2D matrix, where the first axis is the
+        # frequency axis, and the second axis corresponds to a certain
+        # cross-spectrum, as C_ij(f) = result[freq,i+j*nchannels]
+
+        result = np.empty((self.nfft//2+1,nchannels,nchannels),
+                          dtype = NUMPY_COMPLEX_TYPE,
+                          order='F')
+        cdef c[::1,:,:] result_view = result
+        cdef cmat res = cmat_foreign(self.nfft//2+1,
+                                     nchannels*nchannels,
+                                     &result_view[0,0,0])
+        # Copy result
+        cmat_copy(&res,result_ptr)
+
+        cmat_free(&res)
+        dmat_free(&td)
+
+
+        return result
--- a/fftpack/fftpack.h
+++ b/fftpack/fftpack.h
@ -8,7 +8,7 @@
 #pragma once
 #ifndef FFTPACK_H
 #define FFTPACK_H
-#include "tracer.h"
+#include "ascee_tracer.h"
 #include "ascee_alloc.h"
 #include "npy_fftpack.h"

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -2,6 +2,8 @@ include_directories(${CMAKE_SOURCE_DIR}/beamforming/c)
 add_executable(test_bf test_bf.c)
 add_executable(test_workers test_workers.c)
 add_executable(test_fft test_fft.c)
+add_executable(test_math test_math.c)
 target_link_libraries(test_bf beamforming_lib)
 target_link_libraries(test_fft beamforming_lib pthread)
 target_link_libraries(test_workers beamforming_lib pthread)
+target_link_libraries(test_math beamforming_lib pthread efence)
--- a/test/ps_test.py
+++ b/test/ps_test.py
@ -8,10 +8,10 @@ Created on Mon Jan 15 19:45:33 2018
 import numpy as np
 from beamforming.fft import Fft, PowerSpectra

-nfft=2**10
+nfft=2**4
 print('nfft:',nfft)
-print(nfft)
-nchannels = 8
+#print(nfft)
+nchannels = 2

 t = np.linspace(0,1,nfft+1)
 # print(t)
--- a/test/test_fft.c
+++ b/test/test_fft.c
@ -7,11 +7,13 @@
 //////////////////////////////////////////////////////////////////////

 #include "fft.h"
-#include "tracer.h"
+#include "ascee_tracer.h"


 int main() {

+    setTracerLevel(0);
+
    iVARTRACE(15,getTracerLevel());

    Fft* fft = Fft_alloc(100000,8);
--- a/test/test_math.c
+++ b/test/test_math.c
@ -0,0 +1,36 @@
+// test_bf.c
+//
+// Author: J.A. de Jong -ASCEE
+// 
+// Description:
+//
+//////////////////////////////////////////////////////////////////////
+
+#include "fft.h"
+#include "ascee_math.h"
+#include "ascee_tracer.h"
+
+int main() {
+
+    iVARTRACE(15,getTracerLevel());
+
+    vc a = vc_alloc(5);
+    vc_set(&a,2+3*I);
+
+    c_conj_inplace(a.data,a.size);
+    print_vc(&a);
+
+    vc b = vc_alloc(5);    
+
+    c_conj_c(b.data,a.data,5);
+
+    print_vc(&b);    
+
+    vc_free(&a);
+    vc_free(&b);    
+    return 0;
+}
+
+//////////////////////////////////////////////////////////////////////
+
+
--- a/test/test_workers.c
+++ b/test/test_workers.c
@ -7,8 +7,8 @@
 //////////////////////////////////////////////////////////////////////
 #include "worker.h"
 #include "mq.h"
-#include "tracer.h"
-
+#include "ascee_tracer.h"
+#include <unistd.h>
 static void* walloc(void*);
 static int worker(void*,void*);
 static void wfree(void*);
@ -20,7 +20,7 @@ int main() {

    iVARTRACE(15,getTracerLevel());

-    int njobs = 4;
+    us njobs = 4;
    JobQueue* jq = JobQueue_alloc(njobs);
    assert(jq);

@ -43,11 +43,10 @@ int main() {

    return 0;
 }
-
 static void* walloc(void* data) {
    TRACE(15,"WALLOC");
-    uVARTRACE(15,(int) data);
-    return (void*) getpid();
+    uVARTRACE(15,(us) data);
+    return (void*) 1;
 }

 static int worker(void* w_data,void* tj) {