added model prediction and residual calculation using the GPU

This commit is contained in:
Sarod Yatawatta 2017-11-20 12:36:17 +01:00
parent e07a26b63f
commit 028ac53347
7 changed files with 1254 additions and 93 deletions

View File

@ -239,6 +239,7 @@ ParseCmdLine(int ac, char **av) {
}
}
/* real main program */
int
main(int argc, char **argv) {

View File

@ -537,12 +537,27 @@ cout<<myrank<<" : "<<cm<<": downweight ratio ("<<iodata_vec[cm].fratio<<") based
}
for(int cm=0; cm<mymscount; cm++) {
#ifndef HAVE_CUDA
if (!doBeam) {
precalculate_coherencies(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
} else {
precalculate_coherencies_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt);
}
#endif
#ifdef HAVE_CUDA
if (GPUpredict) {
precalculate_coherencies_withbeam_gpu(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,doBeam,Data::Nt);
} else {
if (!doBeam) {
precalculate_coherencies(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
} else {
precalculate_coherencies_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt);
}
}
#endif
}
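Note: every driver touched by this commit uses the same dispatch shape: CUDA builds gate on the runtime flag GPUpredict, and the GPU entry points take doBeam as an argument instead of branching on it, while the CPU fallback keeps the explicit beam/no-beam branch. A minimal sketch of that shape (stub names hypothetical, not the real signatures):

#include <stdio.h>
static void predict_cpu(int beam) { printf("CPU path, beam=%d\n", beam); }
static void predict_gpu(int beam) { printf("GPU path, beam=%d\n", beam); } /* handles both beam modes internally */
int main(void) {
  int GPUpredict=1, doBeam=0;
#ifdef HAVE_CUDA
  if (GPUpredict) { predict_gpu(doBeam); return 0; }
#endif
  predict_cpu(doBeam);
  return 0;
}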
/******************** ADMM *******************************/
@ -734,12 +749,27 @@ cout<<myrank<<" : "<<cm<<": downweight ratio ("<<iodata_vec[cm].fratio<<") based
/* write residuals to output */
for(int cm=0; cm<mymscount; cm++) {
#ifndef HAVE_CUDA
if (!doBeam) {
calculate_residuals_multifreq(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
calculate_residuals_multifreq_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
}
#endif
#ifdef HAVE_CUDA
if (GPUpredict) {
calculate_residuals_multifreq_withbeam_gpu(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,doBeam,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
if (!doBeam) {
calculate_residuals_multifreq(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
calculate_residuals_multifreq_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
}
}
#endif
}
tilex+=iodata_vec[0].tilesz;

View File

@ -526,18 +526,27 @@ main(int argc, char **argv) {
/* FIXME: uvmin is not needed in calibration, because it's taken care of by flags */
if (!Data::DoSim) {
/****************** calibration **************************/
////#ifndef HAVE_CUDA
#ifndef HAVE_CUDA
if (!doBeam) {
precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
} else {
precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
}
////#endif
////#ifdef HAVE_CUDA
//// precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
//// beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
////#endif
#endif
#ifdef HAVE_CUDA
if (GPUpredict) {
precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
} else {
if (!doBeam) {
precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
} else {
precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
}
}
#endif
#ifndef HAVE_CUDA
@ -651,12 +660,28 @@ main(int argc, char **argv) {
} else {
/* since the residual is calculated from xo (not from x), there is no need to unwhiten the data
in case x was whitened */
#ifndef HAVE_CUDA
if (!doBeam) {
calculate_residuals_multifreq(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
calculate_residuals_multifreq_withbeam(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
}
#endif
#ifdef HAVE_CUDA
if (GPUpredict) {
calculate_residuals_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
if (!doBeam) {
calculate_residuals_multifreq(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
calculate_residuals_multifreq_withbeam(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
}
}
#endif
}
/****************** end calibration **************************/
/****************** begin diagnostics ************************/

View File

@ -1224,6 +1224,7 @@ attach_gpu_to_thread1(int card, cublasHandle_t *cbhandle, cusolverDnHandle_t *s
status=cusolverDnCreate(solver_handle);
if (status != CUSOLVER_STATUS_SUCCESS) {
fprintf(stderr,"%s: %d: CUSOLV create fail %d\n",__FILE__,__LINE__,status);
fprintf(stderr,"common problems: not initialized %d, alloc fail %d, no compute %d\n",CUSOLVER_STATUS_NOT_INITIALIZED, CUSOLVER_STATUS_ALLOC_FAILED, CUSOLVER_STATUS_ARCH_MISMATCH);
exit(1);
}
}
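Printing the raw status integers works, but cuSOLVER has no public error-string helper; a small switch over the common cusolverStatus_t codes (a sketch, not part of this commit) would make the log self-describing:

#include <cusolverDn.h>
static const char *
cusolver_errstr(cusolverStatus_t s) {
  switch (s) {
    case CUSOLVER_STATUS_SUCCESS:         return "success";
    case CUSOLVER_STATUS_NOT_INITIALIZED: return "not initialized";
    case CUSOLVER_STATUS_ALLOC_FAILED:    return "alloc failed";
    case CUSOLVER_STATUS_ARCH_MISMATCH:   return "arch mismatch (no compute)";
    default:                              return "other error";
  }
}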

View File

@ -26,6 +26,25 @@
/* enable this for checking for kernel failure */
//#define CUDA_DBG
/* matrix multiplications */
/* C=A*B */
__device__ void
amb(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
c[0]=cuCadd(cuCmul(a[0],b[0]),cuCmul(a[1],b[2]));
c[1]=cuCadd(cuCmul(a[0],b[1]),cuCmul(a[1],b[3]));
c[2]=cuCadd(cuCmul(a[2],b[0]),cuCmul(a[3],b[2]));
c[3]=cuCadd(cuCmul(a[2],b[1]),cuCmul(a[3],b[3]));
}
/* C=A*B^H */
__device__ void
ambt(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
c[0]=cuCadd(cuCmul(a[0],cuConj(b[0])),cuCmul(a[1],cuConj(b[1])));
c[1]=cuCadd(cuCmul(a[0],cuConj(b[2])),cuCmul(a[1],cuConj(b[3])));
c[2]=cuCadd(cuCmul(a[2],cuConj(b[0])),cuCmul(a[3],cuConj(b[1])));
c[3]=cuCadd(cuCmul(a[2],cuConj(b[2])),cuCmul(a[3],cuConj(b[3])));
}
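amb() and ambt() treat each 4-element array as a row-major 2x2 Jones matrix [a0 a1; a2 a3]; ambt() multiplies by the conjugate transpose of the second factor, which is why b is indexed transposed (b[1] and b[2] swapped) and conjugated. A host-side restatement of the same arithmetic (sketch, C99 complex):

#include <complex.h>
/* C = A*B, row-major 2x2 */
static void amb_host(const double complex *a, const double complex *b, double complex *c) {
  c[0] = a[0]*b[0] + a[1]*b[2];
  c[1] = a[0]*b[1] + a[1]*b[3];
  c[2] = a[2]*b[0] + a[3]*b[2];
  c[3] = a[2]*b[1] + a[3]*b[3];
}
/* C = A*B^H: b indexed transposed and conjugated */
static void ambt_host(const double complex *a, const double complex *b, double complex *c) {
  c[0] = a[0]*conj(b[0]) + a[1]*conj(b[1]);
  c[1] = a[0]*conj(b[2]) + a[1]*conj(b[3]);
  c[2] = a[2]*conj(b[0]) + a[3]*conj(b[1]);
  c[3] = a[2]*conj(b[2]) + a[3]*conj(b[3]);
}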
__device__ void
radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float thetaGMST, float *az, float *el) {
float thetaLST=thetaGMST+longitude*180.0f/M_PI;
@ -327,6 +346,11 @@ shapelet_contrib__(int *dd, float u, float v, float w) {
__sincosf((float)dp->eP,&sinph,&cosph);
ut=a*(cosph*up-sinph*vp);
vt=b*(sinph*up+cosph*vp);
/* if (u,v) is far outside the scale (beta) of the shapelet modes, the result is almost always zero,
so check this here and return 0; otherwise spurious NaNs may result */
if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))<dp->beta) {
return make_cuDoubleComplex(0.0,0.0);
}
/* note: we decompose f(-l,m) so the Fourier transform is F(-u,v)
so negate the u grid */
Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float));
@ -345,45 +369,57 @@ shapelet_contrib__(int *dd, float u, float v, float w) {
free(Av);
free(cplx);
//return 2.0*M_PI*(realsum+_Complex_I*imagsum);
realsum*=2.0f*M_PI*a*b;
imagsum*=2.0f*M_PI*a*b;
/* additional safeguards */
if ( isnan(realsum) ) { realsum=0.0f; }
if ( isnan(imagsum) ) { imagsum=0.0f; }
return make_cuDoubleComplex((double)realsum,(double)imagsum);
}
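The early-return guard above is equivalent to testing sqrt(ut*ut+vt*vt) > 100/beta, i.e. the rotated (u,v) radius lies far outside the shapelet's scale, where evaluating the basis would otherwise produce spurious NaNs; the isnan() safeguards then catch anything that still slips through. In plain C the test reads (sketch):

#include <math.h>
/* 1 if (ut,vt) is so far outside the shapelet scale beta that
   the contribution is effectively zero */
static int outside_shapelet_support(float ut, float vt, float beta) {
  float r = sqrtf(ut*ut + vt*vt);
  return r > 0.0f && 100.0f/r < beta;   /* i.e. r > 100/beta */
}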
__device__ cuDoubleComplex
compute_prodterm(int sta1, int sta2, int N, int K, int T, int F,
double phterm0, double sIf, double sI0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w) {
__device__ void
compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F,
double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
/* F>1 is assumed; output: 8x1 array */
double sinph,cosph;
double myfreq=__ldg(&freqs[cf]);
sincos(phterm0*myfreq,&sinph,&cosph);
cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
double If;
if (F==1) {
/* flux: do not use spectra here, because F=1*/
If=sIf;
} else {
/* evaluate spectra */
double fratio=log(myfreq/myf0);
double fratio1=fratio*fratio;
double fratio2=fratio1*fratio;
/* catch -ve flux */
if (sI0f>0.0) {
If=exp(log(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
} else if (sI0f<0.0) {
If=-exp(log(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
} else {
If=0.0;
}
double If,Qf,Uf,Vf;
If=Qf=Uf=Vf=0.0;
/* evaluate spectra */
double fratio=log(myfreq/myf0);
double fratio1=fratio*fratio;
double fratio2=fratio1*fratio;
double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2;
/* catch -ve flux */
if (sI0f>0.0) {
If=exp(log(sI0f)+cm);
} else if (sI0f<0.0) {
If=-exp(log(-sI0f)+cm);
}
if (sQ0f>0.0) {
Qf=exp(log(sQ0f)+cm);
} else if (sQ0f<0.0) {
Qf=-exp(log(-sQ0f)+cm);
}
if (sU0f>0.0) {
Uf=exp(log(sU0f)+cm);
} else if (sU0f<0.0) {
Uf=-exp(log(-sU0f)+cm);
}
if (sV0f>0.0) {
Vf=exp(log(sV0f)+cm);
} else if (sV0f<0.0) {
Vf=-exp(log(-sV0f)+cm);
}
/* smearing */
/* smearing, beam */
double scalef=1.0;
double phterm =(phterm0*0.5*deltaf);
if (phterm!=0.0) {
sinph=(sin(phterm)/phterm);
If *=fabsf(sinph); /* catch -ve values due to rounding off */
scalef *=fabs(sinph); /* catch -ve values due to rounding off */
}
if (dobeam) {
@ -396,13 +432,13 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
//int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm;
int boffset2=itm*N*K*F+k*N*F+cf*N+sta2;
float beam2=__ldg(&beam[boffset2]);
If *=(double)(beam1*beam2);
scalef *=(double)(beam1*beam2);
}
/* form complex value */
prodterm.x *=If;
prodterm.y *=If;
prodterm.x *=scalef;
prodterm.y *=scalef;
/* check for type of source */
if (stypeT!=STYPE_POINT) {
@ -420,10 +456,99 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
}
}
//printf("Input k=%d uvw %lE,%lE,%lE sI=%f phterm0=%lf\n",k,u,v,w,sIf,phterm0);
//printf("phterm %lf smearing %f\n",phterm,sinph);
//printf("k=%d uvw %lE,%lE,%lE If=%f phterm %lf out=%f,%f\n",k,u,v,w,If,phterm,prodterm.x,prodterm.y);
return prodterm;
double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
Ix=If*prodterm.x;
Iy=If*prodterm.y;
Qx=Qf*prodterm.x;
Qy=Qf*prodterm.y;
Ux=Uf*prodterm.x;
Uy=Uf*prodterm.y;
Vx=Vf*prodterm.x;
Vy=Vf*prodterm.y;
output[0]=Ix+Qx;
output[1]=Iy+Qy;
output[2]=Ux-Vy;
output[3]=Vx+Uy;
output[4]=Ux+Vy;
output[5]=-Vx+Uy;
output[6]=Ix-Qx;
output[7]=Iy-Qy;
}
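The 8-element output packs the linear correlations in the order [Re XX, Im XX, Re XY, Im XY, Re YX, Im YX, Re YY, Im YY], using the standard Stokes relations XX=I+Q, XY=U+jV, YX=U-jV, YY=I-Q, each multiplied by the common complex factor prodterm (fringe phase x smearing x beam). A host-side restatement (sketch, C99 complex):

#include <complex.h>
/* Stokes fluxes -> linear-feed correlations, scaled by 'prod' */
static void stokes_to_corr(double sI, double sQ, double sU, double sV,
                           double complex prod, double complex out[4]) {
  out[0] = (sI + sQ) * prod;               /* XX */
  out[1] = (sU + sV*_Complex_I) * prod;    /* XY */
  out[2] = (sU - sV*_Complex_I) * prod;    /* YX */
  out[3] = (sI - sQ) * prod;               /* YY */
}

Expanding out[1] = (U+jV)(px+jpy) gives (U*px - V*py) + j(V*px + U*py), which is exactly the output[2]/output[3] pair written by the kernel above.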
__device__ void
compute_prodterm(int sta1, int sta2, int N, int K, int T,
double phterm0, double If, double Qf, double Uf, double Vf,
double myfreq, double deltaf, int dobeam, int itm, int k, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
/* F==1 is assumed, output: 8x1 array */
double sinph,cosph;
sincos(phterm0*myfreq,&sinph,&cosph);
cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
/* smearing, beam */
double scalef=1.0;
double phterm =(phterm0*0.5*deltaf);
if (phterm!=0.0) {
sinph=(sin(phterm)/phterm);
scalef *=fabs(sinph); /* catch -ve values due to rounding off */
}
if (dobeam) {
/* get beam info */
int boffset1=itm*N*K+k*N+sta1;
// printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2);
float beam1=__ldg(&beam[boffset1]);
int boffset2=itm*N*K+k*N+sta2;
float beam2=__ldg(&beam[boffset2]);
scalef *=(double)(beam1*beam2);
}
/* form complex value */
prodterm.x *=scalef;
prodterm.y *=scalef;
/* check for type of source */
if (stypeT!=STYPE_POINT) {
double uscaled=u*myfreq;
double vscaled=v*myfreq;
double wscaled=w*myfreq;
if (stypeT==STYPE_SHAPELET) {
prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_GAUSSIAN) {
prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_DISK) {
prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_RING) {
prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
}
}
double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
Ix=If*prodterm.x;
Iy=If*prodterm.y;
Qx=Qf*prodterm.x;
Qy=Qf*prodterm.y;
Ux=Uf*prodterm.x;
Uy=Uf*prodterm.y;
Vx=Vf*prodterm.x;
Vy=Vf*prodterm.y;
output[0]=Ix+Qx;
output[1]=Iy+Qy;
output[2]=Ux-Vy;
output[3]=Vx+Uy;
output[4]=Ux+Vy;
output[5]=-Vx+Uy;
output[6]=Ix-Qx;
output[7]=Iy-Qy;
}
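Both prodterm variants apply the same bandwidth-decorrelation factor: scalef *= |sin(phi)/phi| with phi = phterm0*deltaf/2, which tends to 1 as phi -> 0, so wide fringes are left untouched. Isolated (sketch):

#include <math.h>
/* bandwidth-smearing attenuation used in both prodterm variants */
static double
smear_factor(double phterm0, double deltaf) {
  double ph=phterm0*0.5*deltaf;
  return (ph!=0.0 ? fabs(sin(ph)/ph) : 1.0);
}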
@ -431,8 +556,11 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
__global__ void
kernel_coherencies(int B, int N, int T, int K, int F,
const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn, const double *__restrict__ sI,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0, const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn,
const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0,
const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0,
const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
/* global thread index */
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
@ -451,46 +579,96 @@ kernel_coherencies(int B, int N, int T, int K, int F,
//TODO: figure out if this max_f makes any sense
#define MAX_F 20
cuDoubleComplex l_coh[MAX_F];
double l_coh[MAX_F][8];
for(int cf=0; cf<F; cf++) {
l_coh[cf] = make_cuDoubleComplex(0.0, 0.0);
l_coh[cf][0]=0.0;
l_coh[cf][1]=0.0;
l_coh[cf][2]=0.0;
l_coh[cf][3]=0.0;
l_coh[cf][4]=0.0;
l_coh[cf][5]=0.0;
l_coh[cf][6]=0.0;
l_coh[cf][7]=0.0;
}
//use simply for-loop, if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
// split to two cases, F==1 and F>1
if (F==1) {
//use a simple for-loop; if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sIf,sI0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
double sIf,sQf,sUf,sVf;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sIf=__ldg(&sI[k]);
if (F>1) {
sI0f=__ldg(&sI0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
}
sQf=__ldg(&sQ[k]);
sUf=__ldg(&sU[k]);
sVf=__ldg(&sV[k]);
unsigned char stypeT=__ldg(&stype[k]);
double llcoh[8];
compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf,
__ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh);
l_coh[0][0] +=llcoh[0];
l_coh[0][1] +=llcoh[1];
l_coh[0][2] +=llcoh[2];
l_coh[0][3] +=llcoh[3];
l_coh[0][4] +=llcoh[4];
l_coh[0][5] +=llcoh[5];
l_coh[0][6] +=llcoh[6];
l_coh[0][7] +=llcoh[7];
}
} else {
//use a simple for-loop; if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sI0f,sQ0f,sU0f,sV0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sI0f=__ldg(&sI0[k]);
sQ0f=__ldg(&sQ0[k]);
sU0f=__ldg(&sU0[k]);
sV0f=__ldg(&sV0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
unsigned char stypeT=__ldg(&stype[k]);
for(int cf=0; cf<F; cf++) {
l_coh[cf] = cuCadd(l_coh[cf], compute_prodterm(sta1, sta2, N, K, T, F, phterm0, sIf, sI0f, spec_idxf, spec_idx1f, spec_idx2f,
myf0, freqs, deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n));
double llcoh[8];
compute_prodterm_multifreq(sta1, sta2, N, K, T, F, phterm0, sI0f, sQ0f, sU0f, sV0f, spec_idxf, spec_idx1f, spec_idx2f,
myf0, __ldg(&(freqs[cf])), deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n,llcoh);
l_coh[cf][0] +=llcoh[0];
l_coh[cf][1] +=llcoh[1];
l_coh[cf][2] +=llcoh[2];
l_coh[cf][3] +=llcoh[3];
l_coh[cf][4] +=llcoh[4];
l_coh[cf][5] +=llcoh[5];
l_coh[cf][6] +=llcoh[6];
l_coh[cf][7] +=llcoh[7];
}
}
}
}
//write output with the right multi-frequency offset
double *coh1 = &coh[8*n];
for(int cf=0; cf<F; cf++) {
coh1[cf*8*B+0] = l_coh[cf].x;
coh1[cf*8*B+1] = l_coh[cf].y;
coh1[cf*8*B+2] = 0.0;
coh1[cf*8*B+3] = 0.0;
coh1[cf*8*B+4] = 0.0;
coh1[cf*8*B+5] = 0.0;
coh1[cf*8*B+6] = l_coh[cf].x;
coh1[cf*8*B+7] = l_coh[cf].y;
coh1[cf*8*B+0] = l_coh[cf][0];
coh1[cf*8*B+1] = l_coh[cf][1];
coh1[cf*8*B+2] = l_coh[cf][2];
coh1[cf*8*B+3] = l_coh[cf][3];
coh1[cf*8*B+4] = l_coh[cf][4];
coh1[cf*8*B+5] = l_coh[cf][5];
coh1[cf*8*B+6] = l_coh[cf][6];
coh1[cf*8*B+7] = l_coh[cf][7];
}
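On the device the coherencies are stored channel-major: element i of channel cf for baseline n sits at coh[cf*8*B + 8*n + i], so consecutive channels of one baseline are 8*B doubles apart; the host side later re-strides this into the 8M-per-baseline cluster layout (see the my_ccopy calls in the predict file below). Index helper (sketch):

#include <stddef.h>
/* device-side coherency layout: channel-major, 8 doubles per baseline */
static inline size_t
coh_index(int B, int n, int cf, int i) {
  return (size_t)cf*8*B + (size_t)8*n + (size_t)i;
}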
@ -498,6 +676,196 @@ kernel_coherencies(int B, int N, int T, int K, int F,
}
/* kernel to calculate residuals */
__global__ void
kernel_residuals(int B, int N, int T, int K, int F,
const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
const double *__restrict__ p, int nchunk,
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn,
const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0,
const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0,
const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
/* global thread index */
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
/* each thread will calculate for one baseline, over all sources */
if (n<B) {
int sta1=barr[n].sta1;
int sta2=barr[n].sta2;
/* find out which time slot this baseline is from */
int tslot=n/((N*(N-1)/2));
/* find out which chunk to select from p : 0,1..nchunk-1 */
int chunk=(n*nchunk)/B;
/* create G1 and G2 Jones matrices from p */
cuDoubleComplex G1[4],G2[4];
G1[0].x=__ldg(&p[chunk*8*N+sta1*8]);
G1[0].y=__ldg(&p[chunk*8*N+sta1*8+1]);
G1[1].x=__ldg(&p[chunk*8*N+sta1*8+2]);
G1[1].y=__ldg(&p[chunk*8*N+sta1*8+3]);
G1[2].x=__ldg(&p[chunk*8*N+sta1*8+4]);
G1[2].y=__ldg(&p[chunk*8*N+sta1*8+5]);
G1[3].x=__ldg(&p[chunk*8*N+sta1*8+6]);
G1[3].y=__ldg(&p[chunk*8*N+sta1*8+7]);
G2[0].x=__ldg(&p[chunk*8*N+sta2*8]);
G2[0].y=__ldg(&p[chunk*8*N+sta2*8+1]);
G2[1].x=__ldg(&p[chunk*8*N+sta2*8+2]);
G2[1].y=__ldg(&p[chunk*8*N+sta2*8+3]);
G2[2].x=__ldg(&p[chunk*8*N+sta2*8+4]);
G2[2].y=__ldg(&p[chunk*8*N+sta2*8+5]);
G2[3].x=__ldg(&p[chunk*8*N+sta2*8+6]);
G2[3].y=__ldg(&p[chunk*8*N+sta2*8+7]);
double u_n=(u[n]);
double v_n=(v[n]);
double w_n=(w[n]);
//TODO: figure out if this max_f makes any sense
#define MAX_F 20
double l_coh[MAX_F][8];
for(int cf=0; cf<F; cf++) {
l_coh[cf][0]=0.0;
l_coh[cf][1]=0.0;
l_coh[cf][2]=0.0;
l_coh[cf][3]=0.0;
l_coh[cf][4]=0.0;
l_coh[cf][5]=0.0;
l_coh[cf][6]=0.0;
l_coh[cf][7]=0.0;
}
// split to two cases, F==1 and F>1
if (F==1) {
//use a simple for-loop; if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sIf,sQf,sUf,sVf;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sIf=__ldg(&sI[k]);
sQf=__ldg(&sQ[k]);
sUf=__ldg(&sU[k]);
sVf=__ldg(&sV[k]);
unsigned char stypeT=__ldg(&stype[k]);
double llcoh[8];
compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf,
__ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh);
l_coh[0][0] +=llcoh[0];
l_coh[0][1] +=llcoh[1];
l_coh[0][2] +=llcoh[2];
l_coh[0][3] +=llcoh[3];
l_coh[0][4] +=llcoh[4];
l_coh[0][5] +=llcoh[5];
l_coh[0][6] +=llcoh[6];
l_coh[0][7] +=llcoh[7];
}
cuDoubleComplex L1[4],L2[4];
L1[0].x=l_coh[0][0];
L1[0].y=l_coh[0][1];
L1[1].x=l_coh[0][2];
L1[1].y=l_coh[0][3];
L1[2].x=l_coh[0][4];
L1[2].y=l_coh[0][5];
L1[3].x=l_coh[0][6];
L1[3].y=l_coh[0][7];
/* L2=G1*L1 */
amb(G1,L1,L2);
/* L1=L2*G2^H */
ambt(L2,G2,L1);
l_coh[0][0]=L1[0].x;
l_coh[0][1]=L1[0].y;
l_coh[0][2]=L1[1].x;
l_coh[0][3]=L1[1].y;
l_coh[0][4]=L1[2].x;
l_coh[0][5]=L1[2].y;
l_coh[0][6]=L1[3].x;
l_coh[0][7]=L1[3].y;
} else {
//use a simple for-loop; if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sI0f,sQ0f,sU0f,sV0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sI0f=__ldg(&sI0[k]);
sQ0f=__ldg(&sQ0[k]);
sU0f=__ldg(&sU0[k]);
sV0f=__ldg(&sV0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
unsigned char stypeT=__ldg(&stype[k]);
for(int cf=0; cf<F; cf++) {
double llcoh[8];
compute_prodterm_multifreq(sta1, sta2, N, K, T, F, phterm0, sI0f, sQ0f, sU0f, sV0f, spec_idxf, spec_idx1f, spec_idx2f,
myf0, __ldg(&(freqs[cf])), deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n,llcoh);
l_coh[cf][0] +=llcoh[0];
l_coh[cf][1] +=llcoh[1];
l_coh[cf][2] +=llcoh[2];
l_coh[cf][3] +=llcoh[3];
l_coh[cf][4] +=llcoh[4];
l_coh[cf][5] +=llcoh[5];
l_coh[cf][6] +=llcoh[6];
l_coh[cf][7] +=llcoh[7];
}
}
cuDoubleComplex L1[4],L2[4];
for(int cf=0; cf<F; cf++) {
L1[0].x=l_coh[cf][0];
L1[0].y=l_coh[cf][1];
L1[1].x=l_coh[cf][2];
L1[1].y=l_coh[cf][3];
L1[2].x=l_coh[cf][4];
L1[2].y=l_coh[cf][5];
L1[3].x=l_coh[cf][6];
L1[3].y=l_coh[cf][7];
/* L2=G1*L1 */
amb(G1,L1,L2);
/* L1=L2*G2^H */
ambt(L2,G2,L1);
l_coh[cf][0]=L1[0].x;
l_coh[cf][1]=L1[0].y;
l_coh[cf][2]=L1[1].x;
l_coh[cf][3]=L1[1].y;
l_coh[cf][4]=L1[2].x;
l_coh[cf][5]=L1[2].y;
l_coh[cf][6]=L1[3].x;
l_coh[cf][7]=L1[3].y;
}
}
//write output with the right multi-frequency offset
double *coh1 = &coh[8*n];
for(int cf=0; cf<F; cf++) {
coh1[cf*8*B+0] = l_coh[cf][0];
coh1[cf*8*B+1] = l_coh[cf][1];
coh1[cf*8*B+2] = l_coh[cf][2];
coh1[cf*8*B+3] = l_coh[cf][3];
coh1[cf*8*B+4] = l_coh[cf][4];
coh1[cf*8*B+5] = l_coh[cf][5];
coh1[cf*8*B+6] = l_coh[cf][6];
coh1[cf*8*B+7] = l_coh[cf][7];
}
}
}
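kernel_residuals is kernel_coherencies plus the station gains: each baseline picks its solution chunk as chunk = (n*nchunk)/B, so runs of roughly B/nchunk consecutive baselines share one 8N-parameter block, and the accumulated coherency L is wrapped as G1*L*G2^H with amb()/ambt() before the write-out. The chunk mapping in isolation (sketch):

/* map baseline n in [0,B) to a solution chunk in [0,nchunk) */
static inline int
chunk_of_baseline(int n, int B, int nchunk) {
  return (int)(((long)n*nchunk)/B); /* widened here to avoid overflow for large n*nchunk */
}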
/* kernel to convert time (JD) to GMST angle*/
__global__ void
@ -585,7 +953,7 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
/*
calculate coherencies:
B: total baselines
B: total baselines (could be more than one timeslot)
N: no of stations
T: no of time slots
K: no of sources
@ -608,8 +976,8 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
dobeam: enable beam if >0
*/
void
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI,
unsigned char *stype, double *sI0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
cudaError_t error;
error = cudaGetLastError();
@ -620,8 +988,8 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v,
/* note: make sure we do not exceed max no of blocks available,
otherwise (too many baselines, loop over source id) */
int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI,
stype, sI0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
@ -634,6 +1002,33 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v,
}
/* p : parameters, 8N x nchunk values */
void
cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
cudaError_t error;
error = cudaGetLastError();
#endif
/* spawn threads to handle baselines, these threads will spawn threads for sources */
int ThreadsPerBlock=DEFAULT_TH_PER_BK;
/* note: make sure we do not exceed max no of blocks available,
otherwise (too many baselines, loop over source id) */
int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
kernel_residuals<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
if(error != cudaSuccess) {
// print the CUDA error message and exit
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
exit(-1);
}
#endif
}
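Both wrappers use the usual ceil-division launch so B baselines are covered by whole blocks; the n<B guard in the kernels discards the overhang threads, and cudaDeviceSynchronize() makes the following device-to-host copies safe. The launch arithmetic (sketch; DEFAULT_TH_PER_BK is a codebase constant):

/* grid rounded up so B baselines are covered by whole blocks;
   threads with n >= B exit early inside the kernel */
static inline int ceil_div(int a, int b) { return (a + b - 1)/b; }
/* BlocksPerGrid = ceil_div(B, ThreadsPerBlock) */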
/* convert time JD to GMST angle
store result at the same location */
void

View File

@ -92,7 +92,7 @@ dtofcopy(int N, float **x_d, double *x) {
checkCudaError(err,__FILE__,__LINE__);
/* copy memory */
err=cudaMemcpyAsync(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice,0);
err=cudaMemcpy(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* free host buffer */
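The cudaMemcpyAsync -> cudaMemcpy change in dtofcopy() makes the copy explicitly synchronous: the float staging buffer xhost is freed immediately afterwards, and while an async copy from pageable memory is documented to return only after the source has been staged, relying on that is subtle and would break if the buffer were ever allocated pinned. The explicit alternative (sketch, using dtofcopy's own names):

/* alternative: keep the staging buffer alive until the async copy has drained */
err=cudaMemcpyAsync(xc, xhost, N*sizeof(float), cudaMemcpyHostToDevice, 0);
checkCudaError(err,__FILE__,__LINE__);
err=cudaStreamSynchronize(0);   /* copy complete; xhost may now be freed */
checkCudaError(err,__FILE__,__LINE__);
free(xhost);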
@ -120,6 +120,15 @@ precalcoh_threadfn(void *data) {
err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);
/* make sure enough heap memory is available for shapelet computations */
size_t plim;
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
checkCudaError(err,__FILE__,__LINE__);
if (plim<GPU_HEAP_SIZE*1024*1024) {
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
checkCudaError(err,__FILE__,__LINE__);
}
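This heap check is needed because shapelet_contrib__ calls malloc()/free() in device code (the Av and cplx buffers in predict_model.cu above); the CUDA in-kernel heap defaults to only 8 MB, so clusters with many shapelet modes would otherwise see malloc return NULL inside the kernel. GPU_HEAP_SIZE (in MB) is defined elsewhere in the codebase; the same guard recurs in predictvis_threadfn and residual_threadfn below. As a standalone helper (sketch):

#include <cuda_runtime.h>
/* grow the in-kernel malloc heap (default is only 8 MB) before any kernel runs */
static void
ensure_gpu_heap(size_t mb) {
  size_t cur=0, want=mb*1024*1024;
  cudaDeviceGetLimit(&cur, cudaLimitMallocHeapSize);
  if (cur<want) {
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, want);
  }
}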
double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
@ -129,7 +138,7 @@ precalcoh_threadfn(void *data) {
float **xx_p=0,**yy_p=0,**zz_p=0;
float **xxd,**yyd,**zzd;
/* allocate memory in GPU */
err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(float)); /* coherencies only for 1 cluster, Nf=1 */
err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(double)); /* coherencies only for 1 cluster, Nf=1 */
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
checkCudaError(err,__FILE__,__LINE__);
@ -218,9 +227,16 @@ precalcoh_threadfn(void *data) {
float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;
complex double *tempdcoh;
err=cudaMallocHost((void**)&tempdcoh,sizeof(complex double)*(size_t)t->Nbase*4);
checkCudaError(err,__FILE__,__LINE__);
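Allocating tempdcoh once with cudaMallocHost (pinned memory), outside the cluster loop, replaces the per-cluster calloc/free pair that this hunk removes further down: pinned pages let cudaMemcpy DMA directly, the buffer is reused for every cluster, and it is released once with cudaFreeHost after the loop. The shape of the change (sketch):

#include <complex.h>
#include <cuda_runtime.h>
/* pattern: pinned staging buffer allocated once, reused for every cluster */
static void
per_cluster_copy_pattern(int Nbase, int M) {
  double complex *tmp=NULL;
  cudaMallocHost((void**)&tmp, sizeof(double complex)*(size_t)Nbase*4); /* pinned */
  for (int ncl=0; ncl<M; ncl++) {
    /* launch kernel; cudaMemcpy device coherencies into tmp; re-stride into host array */
  }
  cudaFreeHost(tmp);
}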
/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
@ -249,11 +265,29 @@ precalcoh_threadfn(void *data) {
err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
@ -264,7 +298,8 @@ precalcoh_threadfn(void *data) {
err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* for multi channel data */
/* for multi channel data - FIXME: remove this part because Nf==1 always */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
@ -286,6 +321,22 @@ precalcoh_threadfn(void *data) {
err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@ -347,15 +398,10 @@ precalcoh_threadfn(void *data) {
/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
/* copy back coherencies to host,
coherencies on host have 8M stride, on device have 8 stride */
complex double *tempdcoh;
if ((tempdcoh=(complex double*)calloc((size_t)t->Nbase*4,sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
err=cudaMemcpy((double*)tempdcoh, cohd, sizeof(double)*t->Nbase*8, cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
/* now copy this with right offset and stride */
@ -363,7 +409,6 @@ precalcoh_threadfn(void *data) {
my_ccopy(t->Nbase,&tempdcoh[1],4,&(t->coh[4*ncl+1]),4*t->M);
my_ccopy(t->Nbase,&tempdcoh[2],4,&(t->coh[4*ncl+2]),4*t->M);
my_ccopy(t->Nbase,&tempdcoh[3],4,&(t->coh[4*ncl+3]),4*t->M);
free(tempdcoh);
for (cj=0; cj<t->carr[ncl].N; cj++) {
@ -402,8 +447,16 @@ precalcoh_threadfn(void *data) {
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
@ -413,6 +466,7 @@ precalcoh_threadfn(void *data) {
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
@ -423,10 +477,21 @@ precalcoh_threadfn(void *data) {
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
/* free memory */
err=cudaFreeHost(tempdcoh);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
@ -545,7 +610,7 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
threaddata[nth].x=0; /* no input data */
threaddata[nth].N=N;
threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbasextilesz */
threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbase x tilesz (from input) */
threaddata[nth].barr=barr;
threaddata[nth].carr=carr;
threaddata[nth].M=M;
@ -666,7 +731,6 @@ predictvis_threadfn(void *data) {
cudaError_t err;
int ci,ncl,cj;
printf("Attath %d to card %d\n",t->tid,card);
err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);
@ -675,12 +739,13 @@ printf("Attath %d to card %d\n",t->tid,card);
size_t plim;
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
checkCudaError(err,__FILE__,__LINE__);
if (plim<16*1024*1024) { /* 16MB */
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, 16*1024*1024);
if (plim<GPU_HEAP_SIZE*1024*1024) {
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
checkCudaError(err,__FILE__,__LINE__);
}
double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
@ -782,8 +847,10 @@ printf("Attath %d to card %d\n",t->tid,card);
float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;
double *xlocal;
@ -792,7 +859,6 @@ printf("Attath %d to card %d\n",t->tid,card);
/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
printf("th %d clus %d:%d working %d\n",t->tid,t->soff,t->soff+t->Ns-1,ncl);
/* allocate memory for this clusters beam */
if (t->dobeam) {
err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
@ -820,11 +886,27 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
@ -836,6 +918,7 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
/* for multi channel data */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
@ -858,6 +941,23 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@ -920,7 +1020,7 @@ printf("Attath %d to card %d\n",t->tid,card);
/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
/* copy back coherencies to host,
coherencies on host have 8M stride, on device have 8 stride */
@ -964,8 +1064,16 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
@ -975,6 +1083,7 @@ printf("Attath %d to card %d\n",t->tid,card);
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
@ -985,6 +1094,14 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
@ -1037,6 +1154,7 @@ printf("Attath %d to card %d\n",t->tid,card);
free(zz_p);
}
/* reset error state */
err=cudaGetLastError();
return NULL;
@ -1089,7 +1207,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
/* set common parameters, and split baselines to threads */
/* set common parameters, and split clusters to threads */
ci=0;
for (nth=0; nth<Ngpu && ci<M; nth++) {
if (ci+Nthb0<M) {
@ -1158,6 +1276,583 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
free(xlocal);
free(threaddata);
destroy_task_hist(&thst);
free(th_array);
pthread_attr_destroy(&attr);
return 0;
}
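residual_threadfn below mirrors predictvis_threadfn (one pthread per GPU card, clusters split across threads) with two additions: it uploads each cluster's solution block, nchunk blocks of 8N doubles starting at t->p[t->carr[ncl].p[0]], and launches cudakernel_residuals so the predicted model is corrupted by the current Jones solutions before being accumulated into the residual. The parameter upload in isolation (sketch; helper name hypothetical):

#include <cuda_runtime.h>
/* upload one cluster's solutions: nchunk blocks of 8N doubles
   starting at offset poff in the global parameter vector */
static double *
upload_cluster_solutions(const double *p, int poff, int N, int nchunk) {
  double *pd=NULL;
  size_t sz=(size_t)N*8*nchunk*sizeof(double);
  cudaMalloc((void**)&pd, sz);
  cudaMemcpy(pd, &p[poff], sz, cudaMemcpyHostToDevice);
  return pd;
}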
static void *
residual_threadfn(void *data) {
thread_data_pred_t *t=(thread_data_pred_t*)data;
/* first, select a GPU, if total clusters < MAX_GPU_ID
use random selection, else use this thread id */
int card;
if (t->M<=MAX_GPU_ID) {
card=select_work_gpu(MAX_GPU_ID,t->hst);
} else {
card=t->tid;
}
cudaError_t err;
int ci,ncl,cj;
err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);
/* make sure enough heap memory is available for shapelet computations */
size_t plim;
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
checkCudaError(err,__FILE__,__LINE__);
if (plim<GPU_HEAP_SIZE*1024*1024) {
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
checkCudaError(err,__FILE__,__LINE__);
}
double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
double *freqsd;
float *longd=0,*latd=0; double *timed;
int *Nelemd;
float **xx_p=0,**yy_p=0,**zz_p=0;
float **xxd,**yyd,**zzd;
/* allocate memory in GPU */
err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters */
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
checkCudaError(err,__FILE__,__LINE__);
/* copy to device */
/* u,v,w and l,m,n coords need to be double for precision */
err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* check if beam is actually calculated */
if (t->dobeam) {
dtofcopy(t->N,&longd,t->longitude);
dtofcopy(t->N,&latd,t->latitude);
err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* convert time jd to GMST angle */
cudakernel_convert_time(t->tilesz,timed);
err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* jagged arrays for element locations */
err=cudaMalloc((void**)&xxd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yyd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zzd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
/* allocate host memory to store pointers */
if ((xx_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((yy_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((zz_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
for (ci=0; ci<t->N; ci++) {
err=cudaMalloc((void**)&xx_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yy_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zz_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
}
/* now copy data */
for (ci=0; ci<t->N; ci++) {
dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
}
/* now copy pointer locations to device */
err=cudaMemcpy(xxd, xx_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(yyd, yy_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;
double *xlocal;
err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
checkCudaError(err,__FILE__,__LINE__);
double *pd; /* parameter array per cluster */
/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
/* allocate memory for this clusters beam */
if (t->dobeam) {
err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
checkCudaError(err,__FILE__,__LINE__);
} else {
beamd=0;
}
/* copy cluster details to GPU */
err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* parameter vector size may change depending on hybrid parameter */
err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
} else {
rad=0;
decd=0;
}
err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* for multi channel data */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
host_p[cj]=0;
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
checkCudaError(err,__FILE__,__LINE__);
double *modes;
err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
}
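 /* The shapelet branch above is a manual deep copy of a struct that itself
    contains a pointer: the struct is copied to the device first, then its
    host 'modes' pointer is overwritten in device memory with the device copy
    of the modes array. Generic sketch of the pattern (illustrative only):

      exinfo_shapelet *d_s; double *d_m;
      cudaMalloc((void**)&d_s, sizeof(exinfo_shapelet));
      cudaMemcpy(d_s, h_s, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
      cudaMalloc((void**)&d_m, h_s->n0*h_s->n0*sizeof(double));
      cudaMemcpy(d_m, h_s->modes, h_s->n0*h_s->n0*sizeof(double), cudaMemcpyHostToDevice);
      cudaMemcpy(&(d_s->modes), &d_m, sizeof(double*), cudaMemcpyHostToDevice);
 */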
/* now copy pointer locations to device */
err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
/* now calculate beam for all sources in this cluster */
cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
}
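 /* beamd now holds the precomputed array-beam gains (indexed by station,
    time slot, source and channel, as the kernel arguments suggest); the
    residual kernel below consumes it when t->dobeam is set */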
/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
 /* copy back this cluster's model visibilities to host;
    xlocal and cohd are both Nbase*8*Nf doubles, copied contiguously */
err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
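 /* my_daxpy has BLAS daxpy semantics, y := a*x + y, so this accumulates the
    cluster's model visibilities into t->x */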
my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
double *modes=0;
 err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(modes);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
}
}
free(host_p);
err=cudaFree(dev_p);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
err=cudaFree(beamd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(lld);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(mmd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(pd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(decd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx1d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
/* free memory */
err=cudaFreeHost(xlocal);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(wd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(cohd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(barrd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(freqsd);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
err=cudaFree(longd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(latd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(timed);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(Nelemd);
checkCudaError(err,__FILE__,__LINE__);
for (ci=0; ci<t->N; ci++) {
err=cudaFree(xx_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yy_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zz_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(xxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yyd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zzd);
checkCudaError(err,__FILE__,__LINE__);
free(xx_p);
free(yy_p);
free(zz_p);
}
/* reset error state */
err=cudaGetLastError();
return NULL;
}
/* p: 8NMx1 parameter array, but M is 'effective' clusters, need to use carr to find the right offset
ccid: which cluster to use as correction
 rho: MMSE robust parameter; the correction uses (J + rho I) inverted
phase_only: if >0, and if there is any correction done, use only phase of diagonal elements for correction
*/
int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only) {
int nth,ci;
int Nthb0,Nthb;
pthread_attr_t attr;
pthread_t *th_array;
thread_data_pred_t *threaddata;
taskhist thst;
init_task_hist(&thst);
 /* oversubscribe GPU */
int Ngpu=MAX_GPU_ID+1;
 /* number of clusters per thread: ceil(M/Ngpu) */
Nthb0=(M+Ngpu-1)/Ngpu;
 /* set up threads; note: Ngpu is the number of GPUs used */
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* arrays to store result */
double *xlocal;
if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* set common parameters, and split clusters to threads */
ci=0;
for (nth=0; nth<Ngpu && ci<M; nth++) {
if (ci+Nthb0<M) {
Nthb=Nthb0;
} else {
Nthb=M-ci;
}
threaddata[nth].hst=&thst; /* for load balancing */
threaddata[nth].tid=nth;
threaddata[nth].u=u;
threaddata[nth].v=v;
threaddata[nth].w=w;
threaddata[nth].coh=0;
threaddata[nth].x=&xlocal[nth*Nbase*8*tilesz*Nchan]; /* distinct arrays to get back the result */
threaddata[nth].N=N;
 threaddata[nth].Nbase=Nbase*tilesz; /* total baselines: actually Nbase x tilesz */
threaddata[nth].barr=barr;
threaddata[nth].carr=carr;
threaddata[nth].M=M;
threaddata[nth].Nf=Nchan;
threaddata[nth].freqs=freqs;
threaddata[nth].fdelta=fdelta;
threaddata[nth].tdelta=tdelta;
threaddata[nth].dec0=dec0;
threaddata[nth].dobeam=dobeam;
threaddata[nth].ph_ra0=ph_ra0;
threaddata[nth].ph_dec0=ph_dec0;
threaddata[nth].ph_freq0=ph_freq0;
threaddata[nth].longitude=longitude;
threaddata[nth].latitude=latitude;
threaddata[nth].time_utc=time_utc;
threaddata[nth].tilesz=tilesz;
threaddata[nth].Nelem=Nelem;
threaddata[nth].xx=xx;
threaddata[nth].yy=yy;
threaddata[nth].zz=zz;
/* parameters */
threaddata[nth].p=p;
threaddata[nth].Ns=Nthb;
threaddata[nth].soff=ci;
pthread_create(&th_array[nth],&attr,residual_threadfn,(void*)(&threaddata[nth]));
/* next source set */
ci=ci+Nthb;
}
/* now wait for threads to finish */
for(ci=0; ci<nth; ci++) {
pthread_join(th_array[ci],NULL);
 /* subtract model from data to form the residual: x := x - model */
my_daxpy(Nbase*8*tilesz*Nchan,threaddata[ci].x,-1.0,x);
}
free(xlocal);
free(threaddata);
 destroy_task_hist(&thst);
 pthread_attr_destroy(&attr);
 free(th_array);

 return 0;
}
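/* Usage sketch (illustrative; the surrounding driver supplies these arrays):
   x holds the observed visibilities on entry and the data-minus-model
   residual on exit, since each thread's model is subtracted via my_daxpy:

     calculate_residuals_multifreq_withbeam_gpu(u,v,w,p,x,N,Nbase,tilesz,
        barr,carr,M,freqs,Nchan,fdelta,tdelta,dec0,
        ph_ra0,ph_dec0,ph_freq0,longitude,latitude,time_utc,
        Nelem,xx,yy,zz,dobeam,Nt,ccid,rho,phase_only);
*/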

View File

@ -2658,11 +2658,16 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int tilesz, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt);
/* FIXME: add_to_data: no subtraction is done */
extern int
predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data);
extern int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only);
#endif /*!HAVE_CUDA */
@ -2675,6 +2680,12 @@ predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double
#ifndef ARRAY_MAX_ELEM /* if using shared memory, max possible elements for a station */
#define ARRAY_MAX_ELEM 512
#endif
/* default GPU heap size (in MB) needed to calculate some shapelet models;
 if a model has n0>20 or so, try increasing this and recompiling
 (the CUDA default heap is ~ 8MB) */
#ifndef GPU_HEAP_SIZE
#define GPU_HEAP_SIZE 20
#endif
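/* Sketch of how a larger heap would be applied (assumption: the actual call
   site is elsewhere in the source tree); this must run before any kernel
   that uses device-side malloc():

     cudaDeviceSetLimit(cudaLimitMallocHeapSize,
                        (size_t)GPU_HEAP_SIZE*1024*1024);
*/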
extern void
@ -2683,9 +2694,12 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
extern void
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
 unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam);
extern void
cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam);
extern void
cudakernel_convert_time(int T, double *time_utc);