From c0c5424ccd8a8d772792185d1e43579518dfa363 Mon Sep 17 00:00:00 2001
From: Sarod Yatawatta
Date: Wed, 7 Feb 2018 10:36:47 +0100
Subject: [PATCH] update

---
 src/lib/Radio/Radio.h          |    2 +-
 src/lib/Radio/predict_model.cu | 1095 +++++++++++++++++++++-----------
 2 files changed, 737 insertions(+), 360 deletions(-)

diff --git a/src/lib/Radio/Radio.h b/src/lib/Radio/Radio.h
index de0ee0a..4b39d76 100644
--- a/src/lib/Radio/Radio.h
+++ b/src/lib/Radio/Radio.h
@@ -221,7 +221,7 @@ calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double
    if model has n0>20 or so, try increasing this and recompiling
    the default GPU values is ~ 8MB */
 #ifndef GPU_HEAP_SIZE
-#define GPU_HEAP_SIZE 20
+#define GPU_HEAP_SIZE 32
 #endif
 
 extern void
diff --git a/src/lib/Radio/predict_model.cu b/src/lib/Radio/predict_model.cu
index 6678c34..af1f0c4 100644
--- a/src/lib/Radio/predict_model.cu
+++ b/src/lib/Radio/predict_model.cu
@@ -26,6 +26,25 @@
 /* enable this for checking for kernel failure */
 //#define CUDA_DBG
 
+/* matrix multiplications */
+/* C=A*B */
+__device__ void
+amb(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
+ c[0]=cuCadd(cuCmul(a[0],b[0]),cuCmul(a[1],b[2]));
+ c[1]=cuCadd(cuCmul(a[0],b[1]),cuCmul(a[1],b[3]));
+ c[2]=cuCadd(cuCmul(a[2],b[0]),cuCmul(a[3],b[2]));
+ c[3]=cuCadd(cuCmul(a[2],b[1]),cuCmul(a[3],b[3]));
+}
+/* C=A*B^H */
+__device__ void
+ambt(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
+ c[0]=cuCadd(cuCmul(a[0],cuConj(b[0])),cuCmul(a[1],cuConj(b[1])));
+ c[1]=cuCadd(cuCmul(a[0],cuConj(b[2])),cuCmul(a[1],cuConj(b[3])));
+ c[2]=cuCadd(cuCmul(a[2],cuConj(b[0])),cuCmul(a[3],cuConj(b[1])));
+ c[3]=cuCadd(cuCmul(a[2],cuConj(b[2])),cuCmul(a[3],cuConj(b[3])));
+}
+
+
 __device__ void
 radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float thetaGMST, float *az, float *el) {
   float thetaLST=thetaGMST+longitude*180.0f/M_PI;
@@ -51,134 +70,49 @@ radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float th
   *az=azd;
 }
 
-/* slave kernel to calculate phase of manifold vector for given station */
-/* x,y,z: Nx1 arrays of element coords */
-/* sum: scalar to store result */
-/* NOTE: only 1 block should be used here */
-__global__ void
-kernel_array_beam_slave_sin(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
-  unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
-  extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */
-  if (n<N) {
-   tmpsum[n]=sinf(r1*x[n]+r2*y[n]+r3*z[n]);
-  }
-  __syncthreads();
-
-  int nTotalThreads = blockDim_2;
-  while(nTotalThreads > 1) {
-   int halfPoint = (nTotalThreads >> 1); // divide by two
-   if (n < halfPoint) {
-    int thread2 = n + halfPoint;
-    if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
-     tmpsum[n] = tmpsum[n]+tmpsum[thread2];
-    }
-   }
-   __syncthreads();
-   nTotalThreads = halfPoint; // Reducing the binary tree size by two
-  }
-
-  /* now thread 0 will add up results */
-  if (threadIdx.x==0) {
-   *sum=tmpsum[0];
-  }
-}
-
-__global__ void
-kernel_array_beam_slave_cos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
-  unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
-  extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */
-  if (n<N) {
-   tmpsum[n]=cosf(r1*x[n]+r2*y[n]+r3*z[n]);
-  }
-  __syncthreads();
-
-  int nTotalThreads = blockDim_2;
-  while(nTotalThreads > 1) {
-   int halfPoint = (nTotalThreads >> 1); // divide by two
-   if (n < halfPoint) {
-    int thread2 = n + halfPoint;
-    if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
-     tmpsum[n] = tmpsum[n]+tmpsum[thread2];
-    }
-   }
-   __syncthreads();
-   nTotalThreads = halfPoint; // Reducing the binary tree size by two
-  }
-
-  /* now thread 0 will add up results */
-  if (threadIdx.x==0) {
-   *sum=tmpsum[0];
-  }
-}
-
-/* sum: 2x1 array */
-__global__ void
-kernel_array_beam_slave_sincos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
-  unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
-  extern __shared__ float tmpsum[]; /* assumed to be size 2*Nx1 */
-  if (n<N) {
-   sincosf(r1*x[n]+r2*y[n]+r3*z[n],&tmpsum[2*n],&tmpsum[2*n+1]);
-  }
-  __syncthreads();
-
-  int nTotalThreads = blockDim_2;
-  while(nTotalThreads > 1) {
-   int halfPoint = (nTotalThreads >> 1); // divide by two
-   if (n < halfPoint) {
-    int thread2 = n + halfPoint;
-    if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
-     tmpsum[2*n] = tmpsum[2*n]+tmpsum[2*thread2];
-     tmpsum[2*n+1] = tmpsum[2*n+1]+tmpsum[2*thread2+1];
-    }
-   }
-   __syncthreads();
-   nTotalThreads = halfPoint; // Reducing the binary tree size by two
-  }
-
-  /* now thread 0 will add up results */
-  if (threadIdx.x==0) {
-   sum[0]=tmpsum[0];
-   sum[1]=tmpsum[1];
-  }
-}
-
-
-__device__ int
-NearestPowerOf2 (int n){
-  if (!n) return n; //(0 == 2^0)
-
-  int x = 1;
-  while(x < n) {
-   x <<= 1;
-  }
-  return x;
-}
-
-
+/* use compiler directives to use/not use shared memory */
+/* ARRAY_MAX_ELEM : define this in header file beforehand, if using shared memory */
 /* master kernel to calculate beam */
-/* tarr: size NTKFx2 buffer to store sin() cos() sums */
 __global__ void
-kernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude, float *latitude,
- double *time_utc, int *Nelem, float **xx, float **yy, float **zz, float *ra, float *dec, float ph_ra0, float ph_dec0, float ph_freq0, float *beam, float *tarr) {
+kernel_array_beam(int N, int T, int K, int F,
+ const double *__restrict__ freqs, const float *__restrict__ longitude, const float *__restrict__ latitude,
+ const double *__restrict__ time_utc, const int *__restrict__ Nelem,
+ const float * const *__restrict__ xx, const float * const *__restrict__ yy, const float * const *__restrict__ zz,
+ const float *__restrict__ ra, const float *__restrict__ dec,
+ float ph_ra0, float ph_dec0, float ph_freq0, float *beam) {
 
-  /* global thread index */
-  unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
-  /* allowed max threads */
-  int Ntotal=N*T*K*F;
-  if (n>>(Nelems,r1,r2,r3,xx[istat],yy[istat],zz[istat],&tarr[2*n],NearestPowerOf2(Nelems));
-   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
-   error = cudaGetLastError();
-   if(error != cudaSuccess) {
-    // print the CUDA error message and exit
-    printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
-   }
 #endif
-   float ssum=__ldg(&tarr[2*n]);
-   float csum=__ldg(&tarr[2*n+1]);
-   float Nnor=1.0f/(float)Nelems;
   ssum*=Nnor;
   csum*=Nnor;
   /* store output (amplitude of beam)*/
+  int boffset=itm*N*K*F+isrc*N*F+ifrq*N+istat;
   beam[boffset]=sqrtf(ssum*ssum+csum*csum);
   //printf("thread %d stat %d src %d freq %d time %d : %lf longitude=%lf latitude=%lf time=%lf freq=%lf elem=%d ra=%lf dec=%lf beam=%lf\n",n,istat,isrc,ifrq,itm,time_utc[itm],longitude[istat],latitude[istat],time_utc[itm],freqs[ifrq],Nelem[istat],ra[isrc],dec[isrc],beam[boffset]);
  }
@@ -247,8 +175,8 @@ kernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude, fl
 }
 
 /***************************************************************************/
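For reference, the removed slave kernels above all use the same shared-memory reduction idiom: pad the thread count up to the next power of two (NearestPowerOf2) and halve a summation tree until one partial sum remains. Below is a minimal standalone sketch of that idiom, not part of this patch; the names reduce_sum_sketch and next_pow2_sketch and the launch configuration are illustrative only.

#include <cuda_runtime.h>

/* next power of two, as in NearestPowerOf2() above */
__device__ int next_pow2_sketch(int n) {
  if (!n) return n;
  int x=1;
  while (x<n) { x<<=1; }
  return x;
}

/* sum N floats with one block; mirrors the kernel_array_beam_slave_* reduction.
   Assumed launch: reduce_sum_sketch<<<1,N,N*sizeof(float)>>>(N,d_x,d_sum) */
__global__ void reduce_sum_sketch(int N, const float *x, float *sum) {
  extern __shared__ float tmpsum[];
  unsigned int n=threadIdx.x;
  if (n<N) { tmpsum[n]=x[n]; }
  __syncthreads();
  int nTotalThreads=next_pow2_sketch(N); /* pad to a power of two */
  while (nTotalThreads>1) {
    int halfPoint=(nTotalThreads>>1);
    if (n<halfPoint) {
      int thread2=n+halfPoint;
      if (thread2<blockDim.x) { /* skip the fictitious threads N..pow2-1 */
        tmpsum[n]=tmpsum[n]+tmpsum[thread2];
      }
    }
    __syncthreads();
    nTotalThreads=halfPoint;
  }
  if (threadIdx.x==0) { *sum=tmpsum[0]; }
}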
-__device__ cuFloatComplex
-gaussian_contrib(int *dd, float u, float v, float w) {
+__device__ cuDoubleComplex
+gaussian_contrib__(int *dd, float u, float v, float w) {
  exinfo_gaussian *dp=(exinfo_gaussian*)dd;
  float up,vp,a,b,ut,vt,cosph,sinph;
 
@@ -268,13 +196,13 @@ gaussian_contrib(int *dd, float u, float v, float w) {
  ut=a*(cosph*up-sinph*vp);
  vt=b*(sinph*up+cosph*vp);
 
- return make_cuFloatComplex(0.5f*M_PI*expf(-(ut*ut+vt*vt)),0.0f);
+ return make_cuDoubleComplex((double)(0.5f*M_PI*expf(-(ut*ut+vt*vt))),0.0);
 }
 
-__device__ cuFloatComplex
-ring_contrib(int *dd, float u, float v, float w) {
+__device__ cuDoubleComplex
+ring_contrib__(int *dd, float u, float v, float w) {
  exinfo_ring *dp=(exinfo_ring*)dd;
  float up,vp,a,b;
 
@@ -285,11 +213,11 @@ ring_contrib(int *dd, float u, float v, float w) {
  a=dp->eX; /* diameter */
  b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI;
 
- return make_cuFloatComplex(j0f(b),0.0f);
+ return make_cuDoubleComplex((double)j0f(b),0.0);
 }
 
-__device__ cuFloatComplex
-disk_contrib(int *dd, float u, float v, float w) {
+__device__ cuDoubleComplex
+disk_contrib__(int *dd, float u, float v, float w) {
  exinfo_disk *dp=(exinfo_disk*)dd;
  float up,vp,a,b;
 
@@ -300,7 +228,7 @@ disk_contrib(int *dd, float u, float v, float w) {
  a=dp->eX; /* diameter */
  b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI;
 
- return make_cuFloatComplex(j1f(b),0.0f);
+ return make_cuDoubleComplex((double)j1f(b),0.0);
 }
 
 
@@ -324,7 +252,7 @@ H_e(float x, int n) {
 }
 
 __device__ void
-calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av, int *cplx) {
+calculate_uv_mode_vectors_scalar00(float u, float v, float beta, int n0, float *Av, int *cplx) {
 
  int xci,zci,Ntot;
 
@@ -393,8 +321,56 @@ calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av
  free(shpvl);
 }
 
-__device__ cuFloatComplex
-shapelet_contrib(int *dd, float u, float v, float w) {
+
+__device__ void
+calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av, int *cplx, float *fact, float *shpvl) {
+
+ int xci;
+
+ int n1,n2,start;
+ int signval;
+
+
+ /* start filling in the array from the positive values */
+ float xvalu=u*beta;
+ float expvalu=__expf(-0.5f*xvalu*xvalu);
+ float xvalv=v*beta;
+ float expvalv=__expf(-0.5f*xvalv*xvalv);
+ for (xci=0; xcieP,&sinph,&cosph);
  ut=a*(cosph*up-sinph*vp);
  vt=b*(sinph*up+cosph*vp);
+ /* if u,v is way off the scale (beta) of shapelet modes, the result is almost always zero,
+    so check this here and return 0, otherwise spurious nans may result */
+ if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))<dp->beta) {
+  return make_cuDoubleComplex(0.0,0.0);
+ }
  /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v) so negate the u grid */
  Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float));
  cplx=(int*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(int));
- calculate_uv_mode_vectors_scalar(-ut, vt, dp->beta, dp->n0, Av, cplx);
+ float *fact=0;
+ float *shpvl=0;
+ /* set up factorial array */
+ fact=(float *)malloc((size_t)(dp->n0)*sizeof(float));
+ /* setup array to store calculated shapelet value */
+ /* need max storage 2 x n0 */
+ shpvl=(float*)malloc((size_t)(2*dp->n0)*sizeof(float));
+
+ if (!fact || !Av || !cplx || !shpvl) {
+  printf("Error: Device memory allocation failure!! increase heap size.\n");
+ }
+ fact[0]=1.0f;
+ for (ci=1; ci<dp->n0; ci++) {
+  fact[ci]=((float)ci+1.0f)*fact[ci-1];
+ }
+
+
+ calculate_uv_mode_vectors_scalar(-ut, vt, dp->beta, dp->n0, Av, cplx, fact, shpvl);
+
+ free(fact);
+ free(shpvl);
+
 realsum=imagsum=0.0f;
 M=(dp->n0)*(dp->n0);
 for (ci=0; ci1) {
-   sI0f=__ldg(&sI0[k]);
-   spec_idxf=__ldg(&spec_idx[k]);
-   spec_idx1f=__ldg(&spec_idx1[k]);
-   spec_idx2f=__ldg(&spec_idx2[k]);
-   myf0=__ldg(&f0[k]);
-  }
-  unsigned char stypeT=__ldg(&stype[k]);
-  for(int cf=0; cf0.0f) {
-    If=__expf(__logf(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
-   } else if (sI0f<0.0f) {
-    If=-__expf(__logf(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
-   } else {
-    If=0.0f;
-   }
+__device__ void
+compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F,
+double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
+ double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
+ /* F>1 is assumed output: 8x1 array */
+ double sinph,cosph;
+ sincos(phterm0*myfreq,&sinph,&cosph);
+ cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
+ double If,Qf,Uf,Vf;
+ If=Qf=Uf=Vf=0.0;
+ /* evaluate spectra */
+ double fratio=log(myfreq/myf0);
+ double fratio1=fratio*fratio;
+ double fratio2=fratio1*fratio;
+ double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2;
+ /* catch -ve flux */
+ if (sI0f>0.0) {
+  If=exp(log(sI0f)+cm);
+ } else if (sI0f<0.0) {
+  If=-exp(log(-sI0f)+cm);
+ }
+ if (sQ0f>0.0) {
+  Qf=exp(log(sQ0f)+cm);
+ } else if (sQ0f<0.0) {
+  Qf=-exp(log(-sQ0f)+cm);
+ }
+ if (sU0f>0.0) {
+  Uf=exp(log(sU0f)+cm);
+ } else if (sU0f<0.0) {
+  Uf=-exp(log(-sU0f)+cm);
+ }
+ if (sV0f>0.0) {
+  Vf=exp(log(sV0f)+cm);
+ } else if (sV0f<0.0) {
+  Vf=-exp(log(-sV0f)+cm);
 }
 
-  /* smearing */
-  float phterm =phterm0*0.5f*deltaf;
-  if (phterm!=0.0f) {
-   sinph=__sinf(phterm)/phterm;
-   If *=fabsf(sinph); /* catch -ve values due to rounding off */
+
+ /* smearing, beam */
+ double scalef=1.0;
+ double phterm =(phterm0*0.5*deltaf);
+ if (phterm!=0.0) {
+  sinph=(sin(phterm)/phterm);
+  scalef *=fabs(sinph); /* catch -ve values due to rounding off */
  }
 
 if (dobeam) {
  /* get beam info */
  //int boffset1=sta1*K*T*F + k1*T*F + cf*T + itm;
- int boffset1=itm*N*K*F+k1*N*F+cf*N+sta1;
+
+ int boffset1=itm*N*K*F+k*N*F+cf*N+sta1;
+ // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2);
  float beam1=__ldg(&beam[boffset1]);
  //int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm;
- int boffset2=itm*N*K*F+k1*N*F+cf*N+sta2;
+ int boffset2=itm*N*K*F+k*N*F+cf*N+sta2;
  float beam2=__ldg(&beam[boffset2]);
- If *=beam1*beam2;
+ scalef *=(double)(beam1*beam2);
 }
+
 /* form complex value */
- prodterm.x *=If;
- prodterm.y *=If;
+ prodterm.x *=scalef;
+ prodterm.y *=scalef;
 
 /* check for type of source */
- if (stypeT!=STYPE_POINT) {
-  float uscaled=u*myfreq;
-  float vscaled=v*myfreq;
-  float wscaled=w*myfreq;
+ if (stypeT!=STYPE_POINT && !(u==0.0 && v==0.0)) {
+  float uscaled=(float)(u*myfreq);
+  float vscaled=(float)(v*myfreq);
+  float wscaled=(float)(w*myfreq);
   if (stypeT==STYPE_SHAPELET) {
-   prodterm=cuCmulf(shapelet_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
+   prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_GAUSSIAN) {
-   prodterm=cuCmulf(gaussian_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
+   prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_DISK) {
-   prodterm=cuCmulf(disk_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
+   prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_RING) {
-   prodterm=cuCmulf(ring_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
+   prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  }
 }
-
-//printf("k=%d cf=%d freq=%f uvw %f,%f,%f lmn %f,%f,%f phterm %f If %f\n",k,cf,freqs[cf],u,v,w,ll[k],mm[k],nn[k],phterm,If);
-   /* write output to shared array */
-   tmpcoh[k*8*F+8*cf]=prodterm.x;
-   tmpcoh[k*8*F+8*cf+1]=prodterm.y;
-   tmpcoh[k*8*F+8*cf+2]=0.0f;
-   tmpcoh[k*8*F+8*cf+3]=0.0f;
-   tmpcoh[k*8*F+8*cf+4]=0.0f;
-   tmpcoh[k*8*F+8*cf+5]=0.0f;
-   tmpcoh[k*8*F+8*cf+6]=prodterm.x;
-   tmpcoh[k*8*F+8*cf+7]=prodterm.y;
-  }
-  }
-  __syncthreads();
+ double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
+ Ix=If*prodterm.x;
+ Iy=If*prodterm.y;
+ Qx=Qf*prodterm.x;
+ Qy=Qf*prodterm.y;
+ Ux=Uf*prodterm.x;
+ Uy=Uf*prodterm.y;
+ Vx=Vf*prodterm.x;
+ Vy=Vf*prodterm.y;
 
-  // Build summation tree over elements, handling case where total threads is not a power of two.
-  int nTotalThreads = blockDim_2; // Total number of threads (==Kused), rounded up to the next power of two
-  while(nTotalThreads > 1) {
-   int halfPoint = (nTotalThreads >> 1); // divide by two
-   if (k < halfPoint) {
-    int thread2 = k + halfPoint;
-    if (thread2 < blockDim.x) { // Skipping the fictitious threads >Kused ( blockDim.x ... blockDim_2-1 )
-     for(int cf=0; cf>>(sta1,sta2,tslot,B,N,T,K,K,0,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,ll,mm,nn,sI,stype,sI0,f0,spec_idx,spec_idx1,spec_idx2,exs,deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(K));
-   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
-   error = cudaGetLastError();
-   if(error != cudaSuccess) {
-    // print the CUDA error message and exit
-    printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
-   }
 #endif
-  } else {
-   /* more than 1 kernel */
-   int L=(K+ThreadsPerBlock-1)/ThreadsPerBlock;
-   int ct=0;
-   int myT;
-   for (int ci=0; ci>>(sta1,sta2,tslot,B,N,T,K,myT,ct,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,&ll[ct],&mm[ct],&nn[ct],&sI[ct],&stype[ct],&sI0[ct],&f0[ct],&spec_idx[ct],&spec_idx1[ct],&spec_idx2[ct],&exs[ct],deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(myT));
-    cudaDeviceSynchronize();
 #ifdef CUDA_DBG
-    error = cudaGetLastError();
-    if(error != cudaSuccess) {
-     // print the CUDA error message and exit
-     printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
-    }
 #endif
-    ct=ct+ThreadsPerBlock;
-   }
+   //TODO: figure out if this max_f makes any sense
+   #define MAX_F 20
+   double l_coh[MAX_F][8];
+   for(int cf=0; cf1
+   if (F==1) {
+    //use simply for-loop, if K is very large this may be slow and may need further parallelization
+    for (int k=0; k1
+  if (F==1) {
+   //use simply for-loop, if K is very large this may be slow and may need further parallelization
+   for (int k=0; k>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam,buffer);
+
+  /* 2D grid of threads: x dim->data, y dim-> stations */
+  dim3 grid(1, 1, 1);
+  grid.x = (int)ceilf((K*T*F) / (float)ThreadsPerBlock);
+  grid.y = N;
+
+  kernel_array_beam<<<grid,ThreadsPerBlock>>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam);
  cudaDeviceSynchronize();
- cudaFree(buffer);
 #ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess) {
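The replacement launch geometry above is two-dimensional: grid.x covers the K*T*F (source, time, frequency) work items and grid.y gives one row of blocks per station, and each amplitude is written at boffset = itm*N*K*F + isrc*N*F + ifrq*N + istat, i.e. a [T][K][F][N] layout. A small host-side sketch of that sizing and index arithmetic follows; the helper names are hypothetical and only restate what the kernel and launcher above already do.

#include <cuda_runtime.h>
#include <math.h>

/* grid sizing used by cudakernel_array_beam: x over data items, y over stations */
static dim3 beam_grid_sketch(int N, int T, int K, int F, int ThreadsPerBlock) {
  dim3 grid(1,1,1);
  grid.x=(unsigned int)ceilf((float)(K*T*F)/(float)ThreadsPerBlock);
  grid.y=(unsigned int)N;
  return grid;
}

/* beam[] index for (time itm, source isrc, freq ifrq, station istat),
   matching boffset=itm*N*K*F+isrc*N*F+ifrq*N+istat in kernel_array_beam */
static int beam_offset_sketch(int N, int K, int F, int itm, int isrc, int ifrq, int istat) {
  return itm*N*K*F+isrc*N*F+ifrq*N+istat;
}

/* usage, e.g.:
   dim3 grid=beam_grid_sketch(N,T,K,F,DEFAULT_TH_PER_BK);
   kernel_array_beam<<<grid,DEFAULT_TH_PER_BK>>>(...); */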
@@ -751,7 +1084,7 @@ cudakernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude
 
 /* calculate coherencies:
-  B: total baselines
+  B: total baselines (could be more than one timeslot)
   N: no of stations
   T: no of time slots
   K: no of sources
@@ -774,26 +1107,20 @@ cudakernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude
   dobeam: enable beam if >0
 */
 void
-cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, float *w,baseline_t *barr, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI,
- unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *coh,int dobeam) {
+cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
+ unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
 #ifdef CUDA_DBG
  cudaError_t error;
  error = cudaGetLastError();
- error=cudaMemset(coh,0,sizeof(float)*8*B*F);
- checkCudaError(error,__FILE__,__LINE__);
-#endif
-#ifndef CUDA_DBG
- cudaMemset(coh,0,sizeof(float)*8*B*F);
 #endif
 
  /* spawn threads to handle baselines, these threads will spawn threads for sources */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
- // int ThreadsPerBlock=16;
  /* note: make sure we do not exceed max no of blocks available,
  otherwise (too many baselines, loop over source id) */
- int BlocksPerGrid= 2*(B+ThreadsPerBlock-1)/ThreadsPerBlock;
- kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI,
-  stype, sI0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
+ int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
+ kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
+  stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  error = cudaGetLastError();
@@ -806,6 +1133,56 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, fl
 }
 
 
+/* p : parameters 8Nxnchunk values */
+void
+cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
+ unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
+#ifdef CUDA_DBG
+ cudaError_t error;
+ error = cudaGetLastError();
+#endif
+
+ /* spawn threads to handle baselines, these threads will loop over sources */
+ int ThreadsPerBlock=DEFAULT_TH_PER_BK;
+ /* note: make sure we do not exceed max no of blocks available,
+ otherwise (too many baselines, loop over source id) */
+ int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
+ kernel_residuals<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
+  stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
+ cudaDeviceSynchronize();
+#ifdef CUDA_DBG
+ error = cudaGetLastError();
+ if(error != cudaSuccess) {
+  // print the CUDA error message and exit
+  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
+  exit(-1);
+ }
+#endif
+}
+
+
+void
+cudakernel_correct_residuals(int B, int N, int Nb, int boff, int F, int nchunk, double *x, double *p, baseline_t *barr) {
+#ifdef CUDA_DBG
+ cudaError_t error;
+ error = cudaGetLastError();
+#endif
+
+ /* spawn threads to handle baselines */
+ int ThreadsPerBlock=DEFAULT_TH_PER_BK;
+ int BlocksPerGrid=(Nb+ThreadsPerBlock-1)/ThreadsPerBlock;
+ kernel_correct_residuals<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, Nb, boff, F, nchunk, x, p, barr);
+ cudaDeviceSynchronize();
+#ifdef CUDA_DBG
+ error = cudaGetLastError();
+ if(error != cudaSuccess) {
+  // print the CUDA error message and exit
+  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
+  exit(-1);
+ }
+#endif
+}
+
+
 /* convert time JD to GMST angle
  store result at the same location */
 void
@@ -818,10 +1195,10 @@ cudakernel_convert_time(int T, double *time_utc) {
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  /* note: make sure we do not exceed max no of blocks available,
  otherwise (too many baselines, loop over source id) */
- int BlocksPerGrid= 2*(T+ThreadsPerBlock-1)/ThreadsPerBlock;
+ int BlocksPerGrid=(T+ThreadsPerBlock-1)/ThreadsPerBlock;
  kernel_convert_time<<<BlocksPerGrid,ThreadsPerBlock>>>(T,time_utc);
  cudaDeviceSynchronize();
- #ifdef CUDA_DBG
+#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess) {
   // print the CUDA error message and exit
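Every launcher in this file follows the same pattern: launch the kernel, call cudaDeviceSynchronize(), and under CUDA_DBG call cudaGetLastError() and print and exit on failure. A minimal sketch of an equivalent helper is given below; the name check_last_cuda_error_sketch is illustrative only and is not the checkCudaError() referenced elsewhere in the library.

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* illustrative only: mirrors the repeated CUDA_DBG blocks above */
static void check_last_cuda_error_sketch(const char *file, int line) {
#ifdef CUDA_DBG
  cudaError_t error=cudaGetLastError();
  if (error!=cudaSuccess) {
    fprintf(stderr,"CUDA error: %s :%s: %d\n",cudaGetErrorString(error),file,line);
    exit(-1);
  }
#else
  (void)file; (void)line;
#endif
}

/* usage after a launch, e.g.
   kernel_convert_time<<<BlocksPerGrid,ThreadsPerBlock>>>(T,time_utc);
   cudaDeviceSynchronize();
   check_last_cuda_error_sketch(__FILE__,__LINE__);
*/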