added model prediction and residual calculation using GPU

This commit is contained in:
Sarod Yatawatta 2017-11-20 12:36:17 +01:00
parent e07a26b63f
commit 028ac53347
7 changed files with 1254 additions and 93 deletions

View File

@@ -239,6 +239,7 @@ ParseCmdLine(int ac, char **av) {
  }
 }
/* real main program */
int
main(int argc, char **argv) {

View File

@@ -537,12 +537,27 @@ cout<<myrank<<" : "<<cm<<": downweight ratio ("<<iodata_vec[cm].fratio<<") based
 }
 for(int cm=0; cm<mymscount; cm++) {
#ifndef HAVE_CUDA
  if (!doBeam) {
   precalculate_coherencies(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
  } else {
   precalculate_coherencies_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
    beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt);
  }
#endif
#ifdef HAVE_CUDA
  if (GPUpredict) {
   precalculate_coherencies_withbeam_gpu(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
    beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,doBeam,Data::Nt);
  } else {
   if (!doBeam) {
    precalculate_coherencies(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
   } else {
    precalculate_coherencies_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
     beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt);
   }
  }
#endif
 }
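The same four-way dispatch (CUDA build or not, GPUpredict enabled or not, beam correction or not) recurs at every prediction and residual call site touched by this commit. A condensed sketch of the added control flow, with the long argument lists elided and a single #else standing in for the paired #ifndef/#ifdef blocks the diff actually uses:

#ifdef HAVE_CUDA
 if (GPUpredict) {
  /* one GPU routine covers both beam modes via its doBeam flag */
  precalculate_coherencies_withbeam_gpu(/* ..., doBeam, Data::Nt */);
 } else {
  if (!doBeam) { precalculate_coherencies(/* ... */); }
  else { precalculate_coherencies_withbeam(/* ... */); }
 }
#else
 if (!doBeam) { precalculate_coherencies(/* ... */); }
 else { precalculate_coherencies_withbeam(/* ... */); }
#endif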
/******************** ADMM *******************************/
@@ -734,12 +749,27 @@ cout<<myrank<<" : "<<cm<<": downweight ratio ("<<iodata_vec[cm].fratio<<") based
 /* write residuals to output */
 for(int cm=0; cm<mymscount; cm++) {
#ifndef HAVE_CUDA
  if (!doBeam) {
   calculate_residuals_multifreq(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  } else {
   calculate_residuals_multifreq_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
    beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  }
#endif
#ifdef HAVE_CUDA
  if (GPUpredict) {
   calculate_residuals_multifreq_withbeam_gpu(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
    beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,doBeam,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  } else {
   if (!doBeam) {
    calculate_residuals_multifreq(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
   } else {
    calculate_residuals_multifreq_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
     beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
   }
  }
#endif
 }
 tilex+=iodata_vec[0].tilesz;

View File

@@ -526,18 +526,27 @@ main(int argc, char **argv) {
 /* FIXME: uvmin is not needed in calibration, because its taken care of by flags */
 if (!Data::DoSim) {
 /****************** calibration **************************/
-////#ifndef HAVE_CUDA
#ifndef HAVE_CUDA
 if (!doBeam) {
  precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
 } else {
  precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
   beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
 }
-////#endif
#endif
-////#ifdef HAVE_CUDA
#ifdef HAVE_CUDA
-//// precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
-////  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
-////#endif
 if (GPUpredict) {
  precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
   beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
 } else {
  if (!doBeam) {
   precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
  } else {
   precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
    beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
  }
 }
#endif
#ifndef HAVE_CUDA
@@ -651,12 +660,28 @@ main(int argc, char **argv) {
 } else {
 /* since residual is calculated not using x (instead using xo), no need to unwhiten data
 in case x was whitened */
#ifndef HAVE_CUDA
 if (!doBeam) {
  calculate_residuals_multifreq(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
 } else {
  calculate_residuals_multifreq_withbeam(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
   beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
 }
#endif
#ifdef HAVE_CUDA
 if (GPUpredict) {
  calculate_residuals_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
   beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
 } else {
  if (!doBeam) {
   calculate_residuals_multifreq(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  } else {
   calculate_residuals_multifreq_withbeam(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
    beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  }
 }
#endif
 }
 /****************** end calibration **************************/
 /****************** begin diagnostics ************************/

View File

@@ -1224,6 +1224,7 @@ attach_gpu_to_thread1(int card, cublasHandle_t *cbhandle, cusolverDnHandle_t *s
 status=cusolverDnCreate(solver_handle);
 if (status != CUSOLVER_STATUS_SUCCESS) {
  fprintf(stderr,"%s: %d: CUSOLV create fail %d\n",__FILE__,__LINE__,status);
  fprintf(stderr,"common problems: not initialized %d, alloc fail %d, no compute %d\n",CUSOLVER_STATUS_NOT_INITIALIZED, CUSOLVER_STATUS_ALLOC_FAILED, CUSOLVER_STATUS_ARCH_MISMATCH);
  exit(1);
 }
}

View File

@@ -26,6 +26,25 @@
/* enable this for checking for kernel failure */
//#define CUDA_DBG

/* matrix multiplications */
/* C=A*B */
__device__ void
amb(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
 c[0]=cuCadd(cuCmul(a[0],b[0]),cuCmul(a[1],b[2]));
 c[1]=cuCadd(cuCmul(a[0],b[1]),cuCmul(a[1],b[3]));
 c[2]=cuCadd(cuCmul(a[2],b[0]),cuCmul(a[3],b[2]));
 c[3]=cuCadd(cuCmul(a[2],b[1]),cuCmul(a[3],b[3]));
}
/* C=A*B^H */
__device__ void
ambt(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
 c[0]=cuCadd(cuCmul(a[0],cuConj(b[0])),cuCmul(a[1],cuConj(b[1])));
 c[1]=cuCadd(cuCmul(a[0],cuConj(b[2])),cuCmul(a[1],cuConj(b[3])));
 c[2]=cuCadd(cuCmul(a[2],cuConj(b[0])),cuCmul(a[3],cuConj(b[1])));
 c[3]=cuCadd(cuCmul(a[2],cuConj(b[2])),cuCmul(a[3],cuConj(b[3])));
}
__device__ void
radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float thetaGMST, float *az, float *el) {
 float thetaLST=thetaGMST+longitude*180.0f/M_PI;
@@ -327,6 +346,11 @@ shapelet_contrib__(int *dd, float u, float v, float w) {
 __sincosf((float)dp->eP,&sinph,&cosph);
 ut=a*(cosph*up-sinph*vp);
 vt=b*(sinph*up+cosph*vp);
 /* if u,v is way off the scale (beta) of shapelet modes, the result is almost always zero,
 so check this here and return 0, otherwise spurious nans may result */
 if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))<dp->beta) {
  return make_cuDoubleComplex(0.0,0.0);
 }
 /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v)
 so negate the u grid */
 Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float));
@@ -345,45 +369,57 @@ shapelet_contrib__(int *dd, float u, float v, float w) {
 free(Av);
 free(cplx);
-//return 2.0*M_PI*(realsum+_Complex_I*imagsum);
 realsum*=2.0f*M_PI*a*b;
 imagsum*=2.0f*M_PI*a*b;
 /* additional safeguards */
 if ( isnan(realsum) ) { realsum=0.0f; }
 if ( isnan(imagsum) ) { imagsum=0.0f; }
 return make_cuDoubleComplex((double)realsum,(double)imagsum);
}
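The early return added above is a cheap radius cutoff: for $\beta>0$ the test __fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))<dp->beta is equivalent to

$$\sqrt{u_t^2+v_t^2} > \frac{100}{\beta},$$

i.e. the rotated, scaled $(u,v)$ point lies far outside the support of the shapelet basis, where every mode has decayed to essentially zero and evaluating it would only risk the spurious NaNs the comment mentions.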
-__device__ cuDoubleComplex
-compute_prodterm(int sta1, int sta2, int N, int K, int T, int F,
-double phterm0, double sIf, double sI0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
-const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w) {
__device__ void
compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F,
double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
 /* F>1 is assumed, output: 8x1 array */
 double sinph,cosph;
-double myfreq=__ldg(&freqs[cf]);
 sincos(phterm0*myfreq,&sinph,&cosph);
 cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
-double If;
-if (F==1) {
- /* flux: do not use spectra here, because F=1*/
- If=sIf;
-} else {
- /* evaluate spectra */
- double fratio=log(myfreq/myf0);
- double fratio1=fratio*fratio;
- double fratio2=fratio1*fratio;
- /* catch -ve flux */
- if (sI0f>0.0) {
-  If=exp(log(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
- } else if (sI0f<0.0) {
-  If=-exp(log(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
- } else {
-  If=0.0;
- }
-}
 double If,Qf,Uf,Vf;
 If=Qf=Uf=Vf=0.0;
 /* evaluate spectra */
 double fratio=log(myfreq/myf0);
 double fratio1=fratio*fratio;
 double fratio2=fratio1*fratio;
 double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2;
 /* catch -ve flux */
 if (sI0f>0.0) {
  If=exp(log(sI0f)+cm);
 } else if (sI0f<0.0) {
  If=-exp(log(-sI0f)+cm);
 }
 if (sQ0f>0.0) {
  Qf=exp(log(sQ0f)+cm);
 } else if (sQ0f<0.0) {
  Qf=-exp(log(-sQ0f)+cm);
 }
 if (sU0f>0.0) {
  Uf=exp(log(sU0f)+cm);
 } else if (sU0f<0.0) {
  Uf=-exp(log(-sU0f)+cm);
 }
 if (sV0f>0.0) {
  Vf=exp(log(sV0f)+cm);
 } else if (sV0f<0.0) {
  Vf=-exp(log(-sV0f)+cm);
 }
-/* smearing */
 /* smearing, beam */
 double scalef=1.0;
 double phterm =(phterm0*0.5*deltaf);
 if (phterm!=0.0) {
  sinph=(sin(phterm)/phterm);
- If *=fabsf(sinph); /* catch -ve values due to rounding off */
  scalef *=fabs(sinph); /* catch -ve values due to rounding off */
 }
 if (dobeam) {
@@ -396,13 +432,13 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
 //int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm;
 int boffset2=itm*N*K*F+k*N*F+cf*N+sta2;
 float beam2=__ldg(&beam[boffset2]);
- If *=(double)(beam1*beam2);
  scalef *=(double)(beam1*beam2);
 }
 /* form complex value */
-prodterm.x *=If;
-prodterm.y *=If;
 prodterm.x *=scalef;
 prodterm.y *=scalef;
 /* check for type of source */
 if (stypeT!=STYPE_POINT) {
@@ -420,10 +456,99 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
  }
 }
-//printf("Input k=%d uvw %lE,%lE,%lE sI=%f phterm0=%lf\n",k,u,v,w,sIf,phterm0);
-//printf("phterm %lf smearing %f\n",phterm,sinph);
-//printf("k=%d uvw %lE,%lE,%lE If=%f phterm %lf out=%f,%f\n",k,u,v,w,If,phterm,prodterm.x,prodterm.y);
-return prodterm;
 double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
 Ix=If*prodterm.x;
 Iy=If*prodterm.y;
 Qx=Qf*prodterm.x;
 Qy=Qf*prodterm.y;
 Ux=Uf*prodterm.x;
 Uy=Uf*prodterm.y;
 Vx=Vf*prodterm.x;
 Vy=Vf*prodterm.y;
 output[0]=Ix+Qx;
 output[1]=Iy+Qy;
 output[2]=Ux-Vy;
 output[3]=Vx+Uy;
 output[4]=Ux+Vy;
 output[5]=-Vx+Uy;
 output[6]=Ix-Qx;
 output[7]=Iy-Qy;
}
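The eight outputs are the (Re,Im) pairs of the 2x2 brightness matrix for linear feeds, in the order XX, XY, YX, YY, with each Stokes parameter scaled by the common phase/smearing/beam factor held in prodterm; in the standard convention this is

$$\begin{pmatrix} XX & XY \\ YX & YY \end{pmatrix} = \begin{pmatrix} I+Q & U+iV \\ U-iV & I-Q \end{pmatrix}\,\texttt{prodterm},$$

which reduces to the old Stokes-I-only behaviour (XX=YY=I, XY=YX=0) when Q=U=V=0.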
__device__ void
compute_prodterm(int sta1, int sta2, int N, int K, int T,
double phterm0, double If, double Qf, double Uf, double Vf,
double myfreq, double deltaf, int dobeam, int itm, int k, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
/* F==1 is assumed, output: 8x1 array */
double sinph,cosph;
sincos(phterm0*myfreq,&sinph,&cosph);
cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
/* smearing, beam */
double scalef=1.0;
double phterm =(phterm0*0.5*deltaf);
if (phterm!=0.0) {
sinph=(sin(phterm)/phterm);
scalef *=fabs(sinph); /* catch -ve values due to rounding off */
}
if (dobeam) {
/* get beam info */
int boffset1=itm*N*K+k*N+sta1;
// printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2);
float beam1=__ldg(&beam[boffset1]);
int boffset2=itm*N*K+k*N+sta2;
float beam2=__ldg(&beam[boffset2]);
scalef *=(double)(beam1*beam2);
}
/* form complex value */
prodterm.x *=scalef;
prodterm.y *=scalef;
/* check for type of source */
if (stypeT!=STYPE_POINT) {
double uscaled=u*myfreq;
double vscaled=v*myfreq;
double wscaled=w*myfreq;
if (stypeT==STYPE_SHAPELET) {
prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_GAUSSIAN) {
prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_DISK) {
prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_RING) {
prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
}
}
double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
Ix=If*prodterm.x;
Iy=If*prodterm.y;
Qx=Qf*prodterm.x;
Qy=Qf*prodterm.y;
Ux=Uf*prodterm.x;
Uy=Uf*prodterm.y;
Vx=Vf*prodterm.x;
Vy=Vf*prodterm.y;
output[0]=Ix+Qx;
output[1]=Iy+Qy;
output[2]=Ux-Vy;
output[3]=Vx+Uy;
output[4]=Ux+Vy;
output[5]=-Vx+Uy;
output[6]=Ix-Qx;
output[7]=Iy-Qy;
}
@@ -431,8 +556,11 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
__global__ void
kernel_coherencies(int B, int N, int T, int K, int F,
const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
-baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn, const double *__restrict__ sI,
-const unsigned char *__restrict__ stype, const double *__restrict__ sI0, const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn,
const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0,
const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0,
const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
 /* global thread index */
 unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
@@ -451,46 +579,96 @@ kernel_coherencies(int B, int N, int T, int K, int F,
//TODO: figure out if this max_f makes any sense
#define MAX_F 20
-cuDoubleComplex l_coh[MAX_F];
 double l_coh[MAX_F][8];
 for(int cf=0; cf<F; cf++) {
- l_coh[cf] = make_cuDoubleComplex(0.0, 0.0);
  l_coh[cf][0]=0.0;
  l_coh[cf][1]=0.0;
  l_coh[cf][2]=0.0;
  l_coh[cf][3]=0.0;
  l_coh[cf][4]=0.0;
  l_coh[cf][5]=0.0;
  l_coh[cf][6]=0.0;
  l_coh[cf][7]=0.0;
 }
-//use simply for-loop, if K is very large this may be slow and may need further parallelization
-for (int k=0; k<K; k++) {
- //source specific params
- double sIf,sI0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
- double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
- sIf=__ldg(&sI[k]);
- if (F>1) {
-  sI0f=__ldg(&sI0[k]);
-  spec_idxf=__ldg(&spec_idx[k]);
-  spec_idx1f=__ldg(&spec_idx1[k]);
-  spec_idx2f=__ldg(&spec_idx2[k]);
-  myf0=__ldg(&f0[k]);
- }
- unsigned char stypeT=__ldg(&stype[k]);
- for(int cf=0; cf<F; cf++) {
-  l_coh[cf] = cuCadd(l_coh[cf], compute_prodterm(sta1, sta2, N, K, T, F, phterm0, sIf, sI0f, spec_idxf, spec_idx1f, spec_idx2f,
-  myf0, freqs, deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n));
- }
-}
 // split to two cases, F==1 and F>1
 if (F==1) {
  //use simply for-loop, if K is very large this may be slow and may need further parallelization
  for (int k=0; k<K; k++) {
   //source specific params
   double sIf,sQf,sUf,sVf;
   double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
   sIf=__ldg(&sI[k]);
   sQf=__ldg(&sQ[k]);
   sUf=__ldg(&sU[k]);
   sVf=__ldg(&sV[k]);
   unsigned char stypeT=__ldg(&stype[k]);
   double llcoh[8];
   compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf,
    __ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh);
   l_coh[0][0] +=llcoh[0];
   l_coh[0][1] +=llcoh[1];
   l_coh[0][2] +=llcoh[2];
   l_coh[0][3] +=llcoh[3];
   l_coh[0][4] +=llcoh[4];
   l_coh[0][5] +=llcoh[5];
   l_coh[0][6] +=llcoh[6];
   l_coh[0][7] +=llcoh[7];
  }
 } else {
  //use simply for-loop, if K is very large this may be slow and may need further parallelization
  for (int k=0; k<K; k++) {
   //source specific params
   double sI0f,sQ0f,sU0f,sV0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
   double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
   sI0f=__ldg(&sI0[k]);
   sQ0f=__ldg(&sQ0[k]);
   sU0f=__ldg(&sU0[k]);
   sV0f=__ldg(&sV0[k]);
   spec_idxf=__ldg(&spec_idx[k]);
   spec_idx1f=__ldg(&spec_idx1[k]);
   spec_idx2f=__ldg(&spec_idx2[k]);
   myf0=__ldg(&f0[k]);
   unsigned char stypeT=__ldg(&stype[k]);
   for(int cf=0; cf<F; cf++) {
    double llcoh[8];
    compute_prodterm_multifreq(sta1, sta2, N, K, T, F, phterm0, sI0f, sQ0f, sU0f, sV0f, spec_idxf, spec_idx1f, spec_idx2f,
     myf0, __ldg(&(freqs[cf])), deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n,llcoh);
    l_coh[cf][0] +=llcoh[0];
    l_coh[cf][1] +=llcoh[1];
    l_coh[cf][2] +=llcoh[2];
    l_coh[cf][3] +=llcoh[3];
    l_coh[cf][4] +=llcoh[4];
    l_coh[cf][5] +=llcoh[5];
    l_coh[cf][6] +=llcoh[6];
    l_coh[cf][7] +=llcoh[7];
   }
  }
 }
 //write output with right multi frequency offset
 double *coh1 = &coh[8*n];
 for(int cf=0; cf<F; cf++) {
- coh1[cf*8*B+0] = l_coh[cf].x;
- coh1[cf*8*B+1] = l_coh[cf].y;
- coh1[cf*8*B+2] = 0.0;
- coh1[cf*8*B+3] = 0.0;
- coh1[cf*8*B+4] = 0.0;
- coh1[cf*8*B+5] = 0.0;
- coh1[cf*8*B+6] = l_coh[cf].x;
- coh1[cf*8*B+7] = l_coh[cf].y;
  coh1[cf*8*B+0] = l_coh[cf][0];
  coh1[cf*8*B+1] = l_coh[cf][1];
  coh1[cf*8*B+2] = l_coh[cf][2];
  coh1[cf*8*B+3] = l_coh[cf][3];
  coh1[cf*8*B+4] = l_coh[cf][4];
  coh1[cf*8*B+5] = l_coh[cf][5];
  coh1[cf*8*B+6] = l_coh[cf][6];
  coh1[cf*8*B+7] = l_coh[cf][7];
 }
@@ -498,6 +676,196 @@ kernel_coherencies(int B, int N, int T, int K, int F,
}
/* kernel to calculate residuals */
__global__ void
kernel_residuals(int B, int N, int T, int K, int F,
const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
const double *__restrict__ p, int nchunk,
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn,
const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0,
const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0,
const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
/* global thread index */
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
/* each thread will calculate for one baseline, over all sources */
if (n<B) {
int sta1=barr[n].sta1;
int sta2=barr[n].sta2;
/* find out which time slot this baseline is from */
int tslot=n/((N*(N-1)/2));
/* find out which chunk to select from p : 0,1..nchunk-1 */
int chunk=(n*nchunk)/B;
/* create G1 and G2 Jones matrices from p */
cuDoubleComplex G1[4],G2[4];
G1[0].x=__ldg(&p[chunk*8*N+sta1*8]);
G1[0].y=__ldg(&p[chunk*8*N+sta1*8+1]);
G1[1].x=__ldg(&p[chunk*8*N+sta1*8+2]);
G1[1].y=__ldg(&p[chunk*8*N+sta1*8+3]);
G1[2].x=__ldg(&p[chunk*8*N+sta1*8+4]);
G1[2].y=__ldg(&p[chunk*8*N+sta1*8+5]);
G1[3].x=__ldg(&p[chunk*8*N+sta1*8+6]);
G1[3].y=__ldg(&p[chunk*8*N+sta1*8+7]);
G2[0].x=__ldg(&p[chunk*8*N+sta2*8]);
G2[0].y=__ldg(&p[chunk*8*N+sta2*8+1]);
G2[1].x=__ldg(&p[chunk*8*N+sta2*8+2]);
G2[1].y=__ldg(&p[chunk*8*N+sta2*8+3]);
G2[2].x=__ldg(&p[chunk*8*N+sta2*8+4]);
G2[2].y=__ldg(&p[chunk*8*N+sta2*8+5]);
G2[3].x=__ldg(&p[chunk*8*N+sta2*8+6]);
G2[3].y=__ldg(&p[chunk*8*N+sta2*8+7]);
double u_n=(u[n]);
double v_n=(v[n]);
double w_n=(w[n]);
//TODO: figure out if this max_f makes any sense
#define MAX_F 20
double l_coh[MAX_F][8];
for(int cf=0; cf<F; cf++) {
l_coh[cf][0]=0.0;
l_coh[cf][1]=0.0;
l_coh[cf][2]=0.0;
l_coh[cf][3]=0.0;
l_coh[cf][4]=0.0;
l_coh[cf][5]=0.0;
l_coh[cf][6]=0.0;
l_coh[cf][7]=0.0;
}
// split to two cases, F==1 and F>1
if (F==1) {
//use simply for-loop, if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sIf,sQf,sUf,sVf;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sIf=__ldg(&sI[k]);
sQf=__ldg(&sQ[k]);
sUf=__ldg(&sU[k]);
sVf=__ldg(&sV[k]);
unsigned char stypeT=__ldg(&stype[k]);
double llcoh[8];
compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf,
__ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh);
l_coh[0][0] +=llcoh[0];
l_coh[0][1] +=llcoh[1];
l_coh[0][2] +=llcoh[2];
l_coh[0][3] +=llcoh[3];
l_coh[0][4] +=llcoh[4];
l_coh[0][5] +=llcoh[5];
l_coh[0][6] +=llcoh[6];
l_coh[0][7] +=llcoh[7];
}
cuDoubleComplex L1[4],L2[4];
L1[0].x=l_coh[0][0];
L1[0].y=l_coh[0][1];
L1[1].x=l_coh[0][2];
L1[1].y=l_coh[0][3];
L1[2].x=l_coh[0][4];
L1[2].y=l_coh[0][5];
L1[3].x=l_coh[0][6];
L1[3].y=l_coh[0][7];
/* L2=G1*L1 */
amb(G1,L1,L2);
/* L1=L2*G2^H */
ambt(L2,G2,L1);
l_coh[0][0]=L1[0].x;
l_coh[0][1]=L1[0].y;
l_coh[0][2]=L1[1].x;
l_coh[0][3]=L1[1].y;
l_coh[0][4]=L1[2].x;
l_coh[0][5]=L1[2].y;
l_coh[0][6]=L1[3].x;
l_coh[0][7]=L1[3].y;
} else {
//use simply for-loop, if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sI0f,sQ0f,sU0f,sV0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sI0f=__ldg(&sI0[k]);
sQ0f=__ldg(&sQ0[k]);
sU0f=__ldg(&sU0[k]);
sV0f=__ldg(&sV0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
unsigned char stypeT=__ldg(&stype[k]);
for(int cf=0; cf<F; cf++) {
double llcoh[8];
compute_prodterm_multifreq(sta1, sta2, N, K, T, F, phterm0, sI0f, sQ0f, sU0f, sV0f, spec_idxf, spec_idx1f, spec_idx2f,
myf0, __ldg(&(freqs[cf])), deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n,llcoh);
l_coh[cf][0] +=llcoh[0];
l_coh[cf][1] +=llcoh[1];
l_coh[cf][2] +=llcoh[2];
l_coh[cf][3] +=llcoh[3];
l_coh[cf][4] +=llcoh[4];
l_coh[cf][5] +=llcoh[5];
l_coh[cf][6] +=llcoh[6];
l_coh[cf][7] +=llcoh[7];
}
}
cuDoubleComplex L1[4],L2[4];
for(int cf=0; cf<F; cf++) {
L1[0].x=l_coh[cf][0];
L1[0].y=l_coh[cf][1];
L1[1].x=l_coh[cf][2];
L1[1].y=l_coh[cf][3];
L1[2].x=l_coh[cf][4];
L1[2].y=l_coh[cf][5];
L1[3].x=l_coh[cf][6];
L1[3].y=l_coh[cf][7];
/* L2=G1*L1 */
amb(G1,L1,L2);
/* L1=L2*G2^H */
ambt(L2,G2,L1);
l_coh[cf][0]=L1[0].x;
l_coh[cf][1]=L1[0].y;
l_coh[cf][2]=L1[1].x;
l_coh[cf][3]=L1[1].y;
l_coh[cf][4]=L1[2].x;
l_coh[cf][5]=L1[2].y;
l_coh[cf][6]=L1[3].x;
l_coh[cf][7]=L1[3].y;
}
}
//write output with right multi frequency offset
double *coh1 = &coh[8*n];
for(int cf=0; cf<F; cf++) {
coh1[cf*8*B+0] = l_coh[cf][0];
coh1[cf*8*B+1] = l_coh[cf][1];
coh1[cf*8*B+2] = l_coh[cf][2];
coh1[cf*8*B+3] = l_coh[cf][3];
coh1[cf*8*B+4] = l_coh[cf][4];
coh1[cf*8*B+5] = l_coh[cf][5];
coh1[cf*8*B+6] = l_coh[cf][6];
coh1[cf*8*B+7] = l_coh[cf][7];
}
}
}
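Summarizing the kernel above: for each baseline (p,q) it accumulates the model coherencies of all K sources per channel and then applies the stored solutions via amb()/ambt(), so what is written to coh is the gain-corrupted model

$$\hat V_{pq} = G_p \Big( \sum_{k=1}^{K} C^{(k)}_{pq} \Big) G_q^{H};$$

the subtraction from the observed visibilities that yields the actual residual is presumably done by the calling routine, as in the CPU calculate_residuals_multifreq path.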
/* kernel to convert time (JD) to GMST angle*/
__global__ void
@@ -585,7 +953,7 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
/*
 calculate coherencies:
- B: total baselines
 B: total baselines (could be more than one timeslot)
 N: no of stations
 T: no of time slots
 K: no of sources
@@ -608,8 +976,8 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
 dobeam: enable beam if >0
*/
void
-cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI,
-unsigned char *stype, double *sI0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
 cudaError_t error;
 error = cudaGetLastError();
@@ -620,8 +988,8 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v,
 /* note: make sure we do not exceed max no of blocks available,
 otherwise (too many baselines, loop over source id) */
 int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
-kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI,
-stype, sI0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
 kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
 stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 error = cudaGetLastError();
@@ -634,6 +1002,33 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v,
}
/* p : parameters 8Nxnchunk values */
void
cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
cudaError_t error;
error = cudaGetLastError();
#endif
/* spawn threads to handle baselines, these threads will spawn threads for sources */
int ThreadsPerBlock=DEFAULT_TH_PER_BK;
/* note: make sure we do not exceed max no of blocks available,
otherwise (too many baselines, loop over source id) */
int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
kernel_residuals<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
if(error != cudaSuccess) {
// print the CUDA error message and exit
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
exit(-1);
}
#endif
}
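The only inputs beyond cudakernel_coherencies are the solution vector p (8N doubles per chunk, i.e. four complex Jones entries per station) and nchunk. Inside kernel_residuals, baseline n selects its solution interval as chunk=(n*nchunk)/B, so consecutive baselines map to consecutive solution chunks. A sketch of that indexing, mirroring the kernel code above (illustrative only):

int chunk=(n*nchunk)/B;                 /* 0 .. nchunk-1 */
const double *g1=&p[chunk*8*N+sta1*8];  /* G1: g1[0]+i*g1[1], ..., g1[6]+i*g1[7], row-major 2x2 */
const double *g2=&p[chunk*8*N+sta2*8];  /* G2: same packing, for station sta2 */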
/* convert time JD to GMST angle
 store result at the same location */
void

View File

@@ -92,7 +92,7 @@ dtofcopy(int N, float **x_d, double *x) {
 checkCudaError(err,__FILE__,__LINE__);
 /* copy memory */
-err=cudaMemcpyAsync(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice,0);
 err=cudaMemcpy(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 /* free host buffer */
@@ -120,6 +120,15 @@ precalcoh_threadfn(void *data) {
 err=cudaSetDevice(card);
 checkCudaError(err,__FILE__,__LINE__);
/* make sure enough heap memory is available for shapelet computations */
size_t plim;
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
checkCudaError(err,__FILE__,__LINE__);
if (plim<GPU_HEAP_SIZE*1024*1024) {
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
checkCudaError(err,__FILE__,__LINE__);
}
 double *ud,*vd,*wd;
 double *cohd;
 baseline_t *barrd;
@@ -129,7 +138,7 @@ precalcoh_threadfn(void *data) {
 float **xx_p=0,**yy_p=0,**zz_p=0;
 float **xxd,**yyd,**zzd;
 /* allocate memory in GPU */
-err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(float)); /* coherencies only for 1 cluster, Nf=1 */
 err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(double)); /* coherencies only for 1 cluster, Nf=1 */
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
 checkCudaError(err,__FILE__,__LINE__);
@@ -218,9 +227,16 @@ precalcoh_threadfn(void *data) {
 float *beamd;
 double *lld,*mmd,*nnd;
 double *sId; float *rad,*decd;
 double *sQd,*sUd,*sVd;
 unsigned char *styped;
 double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
 double *sQ0d,*sU0d,*sV0d;
 int **host_p,**dev_p;

 complex double *tempdcoh;
 err=cudaMallocHost((void**)&tempdcoh,sizeof(complex double)*(size_t)t->Nbase*4);
 checkCudaError(err,__FILE__,__LINE__);

 /******************* begin loop over clusters **************************/
 for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
@@ -249,11 +265,29 @@ precalcoh_threadfn(void *data) {
 err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
 err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
 if (t->dobeam) {
  dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
  dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
@@ -264,7 +298,8 @@ precalcoh_threadfn(void *data) {
 err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
-/* for multi channel data */
 /* for multi channel data - FIXME: remove this part because Nf==1 always */
if (t->Nf>1) {
 err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
@@ -286,6 +321,22 @@ precalcoh_threadfn(void *data) {
 err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
 /* extra info for source, if any */
 if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@@ -347,15 +398,10 @@ precalcoh_threadfn(void *data) {
 /* calculate coherencies for all sources in this cluster, add them up */
 cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
-lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
 lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
 /* copy back coherencies to host,
 coherencies on host have 8M stride, on device have 8 stride */
-complex double *tempdcoh;
-if ((tempdcoh=(complex double*)calloc((size_t)t->Nbase*4,sizeof(complex double)))==0) {
- fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
- exit(1);
-}
 err=cudaMemcpy((double*)tempdcoh, cohd, sizeof(double)*t->Nbase*8, cudaMemcpyDeviceToHost);
 checkCudaError(err,__FILE__,__LINE__);
 /* now copy this with right offset and stride */
@@ -363,7 +409,6 @@ precalcoh_threadfn(void *data) {
 my_ccopy(t->Nbase,&tempdcoh[1],4,&(t->coh[4*ncl+1]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[2],4,&(t->coh[4*ncl+2]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[3],4,&(t->coh[4*ncl+3]),4*t->M);
-free(tempdcoh);
 for (cj=0; cj<t->carr[ncl].N; cj++) {
@@ -402,8 +447,16 @@ precalcoh_threadfn(void *data) {
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(nnd);
 checkCudaError(err,__FILE__,__LINE__);
 if (t->Nf==1) {
 err=cudaFree(sId);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
 if (t->dobeam) {
  err=cudaFree(rad);
  checkCudaError(err,__FILE__,__LINE__);
@@ -413,6 +466,7 @@ precalcoh_threadfn(void *data) {
 err=cudaFree(styped);
 checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
 err=cudaFree(sI0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(f0d);
@@ -423,10 +477,21 @@ precalcoh_threadfn(void *data) {
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(spec_idx2d);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
/* free memory */
err=cudaFreeHost(tempdcoh);
checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(ud);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(vd);
@@ -545,7 +610,7 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
 threaddata[nth].x=0; /* no input data */
 threaddata[nth].N=N;
-threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbasextilesz */
 threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbasextilesz (from input) */
 threaddata[nth].barr=barr;
 threaddata[nth].carr=carr;
 threaddata[nth].M=M;
@@ -666,7 +731,6 @@ predictvis_threadfn(void *data) {
 cudaError_t err;
 int ci,ncl,cj;
-printf("Attath %d to card %d\n",t->tid,card);
 err=cudaSetDevice(card);
 checkCudaError(err,__FILE__,__LINE__);
@@ -675,12 +739,13 @@ printf("Attath %d to card %d\n",t->tid,card);
 size_t plim;
 err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
 checkCudaError(err,__FILE__,__LINE__);
-if (plim<16*1024*1024) { /* 16MB */
- err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, 16*1024*1024);
 if (plim<GPU_HEAP_SIZE*1024*1024) {
  err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
  checkCudaError(err,__FILE__,__LINE__);
 }
 double *ud,*vd,*wd;
 double *cohd;
 baseline_t *barrd;
@@ -782,8 +847,10 @@ printf("Attath %d to card %d\n",t->tid,card);
 float *beamd;
 double *lld,*mmd,*nnd;
 double *sId; float *rad,*decd;
 double *sQd,*sUd,*sVd;
 unsigned char *styped;
 double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
 double *sQ0d,*sU0d,*sV0d;
 int **host_p,**dev_p;
 double *xlocal;
@@ -792,7 +859,6 @@ printf("Attath %d to card %d\n",t->tid,card);
 /******************* begin loop over clusters **************************/
 for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
-printf("th %d clus %d:%d working %d\n",t->tid,t->soff,t->soff+t->Ns-1,ncl);
 /* allocate memory for this clusters beam */
 if (t->dobeam) {
  err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
@@ -820,11 +886,27 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);
 if (t->Nf==1) {
 err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
 if (t->dobeam) {
  dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
  dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
@@ -836,6 +918,7 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);
 /* for multi channel data */
 if (t->Nf>1) {
 err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
@@ -858,6 +941,23 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
 /* extra info for source, if any */
 if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@@ -920,7 +1020,7 @@ printf("Attath %d to card %d\n",t->tid,card);
 /* calculate coherencies for all sources in this cluster, add them up */
 cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
-lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
 lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
 /* copy back coherencies to host,
 coherencies on host have 8M stride, on device have 8 stride */
@@ -964,8 +1064,16 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(nnd);
 checkCudaError(err,__FILE__,__LINE__);
 if (t->Nf==1) {
 err=cudaFree(sId);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
 if (t->dobeam) {
  err=cudaFree(rad);
  checkCudaError(err,__FILE__,__LINE__);
@@ -975,6 +1083,7 @@ printf("Attath %d to card %d\n",t->tid,card);
 err=cudaFree(styped);
 checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
 err=cudaFree(sI0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(f0d);
@@ -985,6 +1094,14 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(spec_idx2d);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
@@ -1037,6 +1154,7 @@ printf("Attath %d to card %d\n",t->tid,card);
 free(zz_p);
}
/* reset error state */
 err=cudaGetLastError();
 return NULL;
@@ -1089,7 +1207,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
-/* set common parameters, and split baselines to threads */
 /* set common parameters, and split clusters to threads */
 ci=0;
 for (nth=0; nth<Ngpu && ci<M; nth++) {
  if (ci+Nthb0<M) {
@@ -1158,6 +1276,583 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
free(xlocal);
free(threaddata);
destroy_task_hist(&thst);
free(th_array);
pthread_attr_destroy(&attr);
return 0;
}
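/* Both drivers in this file split the M clusters over GPU threads with a
   ceiling division. A minimal standalone sketch of that arithmetic
   (split_example is a hypothetical name, not part of this file): */
static int
split_example(int M, int Ngpu) {
 int Nthb0=(M+Ngpu-1)/Ngpu; /* at most ceil(M/Ngpu) clusters per thread */
 int ci=0,nth;
 for (nth=0; nth<Ngpu && ci<M; nth++) {
  int Nthb=(ci+Nthb0<M?Nthb0:M-ci); /* last thread takes the remainder */
  /* thread nth handles clusters [ci,ci+Nthb) */
  ci=ci+Nthb;
 }
 return nth; /* number of threads actually needed */
}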
static void *
residual_threadfn(void *data) {
thread_data_pred_t *t=(thread_data_pred_t*)data;
/* first, select a GPU: if total clusters <= MAX_GPU_ID,
 use random (load-balanced) selection, else use this thread id */
int card;
if (t->M<=MAX_GPU_ID) {
card=select_work_gpu(MAX_GPU_ID,t->hst);
} else {
card=t->tid;
}
cudaError_t err;
int ci,ncl,cj;
err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);
/* make sure enough heap memory is available for shapelet computations */
size_t plim;
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
checkCudaError(err,__FILE__,__LINE__);
if (plim<GPU_HEAP_SIZE*1024*1024) {
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
checkCudaError(err,__FILE__,__LINE__);
}
double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
double *freqsd;
float *longd=0,*latd=0; double *timed=0;
int *Nelemd=0;
float **xx_p=0,**yy_p=0,**zz_p=0;
float **xxd,**yyd,**zzd;
/* allocate memory in GPU */
err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
checkCudaError(err,__FILE__,__LINE__);
/* copy to device */
/* u,v,w and l,m,n coords need to be double for precision */
err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* check if beam is actually calculated */
if (t->dobeam) {
dtofcopy(t->N,&longd,t->longitude);
dtofcopy(t->N,&latd,t->latitude);
err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* convert time jd to GMST angle */
cudakernel_convert_time(t->tilesz,timed);
/* station element counts are only needed on the device when the beam is evaluated */
err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* jagged arrays for element locations */
err=cudaMalloc((void**)&xxd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yyd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zzd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
/* allocate host memory to store pointers */
if ((xx_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((yy_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((zz_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
/* copy data: dtofcopy() allocates the device float array itself and
   copies the double precision coordinates into it, so no separate
   per-station cudaMalloc is needed here */
for (ci=0; ci<t->N; ci++) {
dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
}
/* now copy pointer locations to device */
err=cudaMemcpy(xxd, xx_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(yyd, yy_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(zzd, zz_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;
double *xlocal;
err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
checkCudaError(err,__FILE__,__LINE__);
double *pd; /* parameter array per cluster */
/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
/* allocate memory for this clusters beam */
if (t->dobeam) {
err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
checkCudaError(err,__FILE__,__LINE__);
} else {
beamd=0;
}
/* copy cluster details to GPU */
err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* parameter vector size may change depending on hybrid parameter */
err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
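/* pd now holds this cluster's solutions: nchunk copies of 8N real
   parameters (one 2x2 complex Jones matrix per station), located in the
   global 8NM parameter array via the carr[ncl].p[] offsets */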
if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
} else {
rad=0;
decd=0;
}
err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* for multi channel data */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
host_p[cj]=0;
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
checkCudaError(err,__FILE__,__LINE__);
double *modes;
err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
}
/* now copy pointer locations to device */
err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
/* now calculate beam for all sources in this cluster */
cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
}
/* calculate model visibilities (with solutions applied) for all sources in this cluster, add them up */
cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
/* copy back this cluster's contribution to the host */
err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
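/* accumulate onto the thread-local sum: t->x <- t->x + 1.0*xlocal
   (my_daxpy assumed to have BLAS daxpy semantics, y <- y + a*x) */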
my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
double *modes=0;
err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(modes);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
}
}
free(host_p);
err=cudaFree(dev_p);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
err=cudaFree(beamd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(lld);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(mmd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(pd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(decd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx1d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
/* free memory */
err=cudaFreeHost(xlocal);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(wd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(cohd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(barrd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(freqsd);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
err=cudaFree(longd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(latd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(timed);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(Nelemd);
checkCudaError(err,__FILE__,__LINE__);
for (ci=0; ci<t->N; ci++) {
err=cudaFree(xx_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yy_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zz_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(xxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yyd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zzd);
checkCudaError(err,__FILE__,__LINE__);
free(xx_p);
free(yy_p);
free(zz_p);
}
/* reset error state */
err=cudaGetLastError();
return NULL;
}
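/* Note on the cudaDeviceSetLimit(cudaLimitMallocHeapSize,...) call above:
   shapelet evaluation allocates per-thread scratch with device-side
   malloc(), which draws from that heap, so models with large n0 can
   exhaust the default ~8MB pool. An illustrative kernel (hypothetical,
   shown only to demonstrate the allocation pattern; real device code
   lives with the other cudakernel_* implementations): */
__global__ void
kernel_scratch_example(int n0, int *failed) {
 /* per-thread scratch for an n0 x n0 mode table */
 double *scratch=(double*)malloc((size_t)n0*n0*sizeof(double));
 if (!scratch) {
  atomicAdd(failed,1); /* heap exhausted: increase GPU_HEAP_SIZE */
  return;
 }
 /* ... fill and use scratch ... */
 free(scratch);
}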
/* p: 8NMx1 parameter array; M here is the number of 'effective' clusters,
   so carr must be used to find the right offset
 ccid: which cluster to use as correction
 rho: MMSE robust parameter, (J + rho I) is inverted
 phase_only: if >0, and if any correction is done, use only the phase of
   the diagonal elements for the correction
*/
int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only) {
int nth,ci;
int Nthb0,Nthb;
pthread_attr_t attr;
pthread_t *th_array;
thread_data_pred_t *threaddata;
taskhist thst;
init_task_hist(&thst);
/* oversubscribe GPU */
int Ngpu=MAX_GPU_ID+1;
/* calculate max clusters per thread: ceil(M/Ngpu) */
Nthb0=(M+Ngpu-1)/Ngpu;
/* setup threads : note: Ngpu is no of GPUs used */
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* arrays to store result */
double *xlocal;
if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* set common parameters, and split clusters to threads */
ci=0;
for (nth=0; nth<Ngpu && ci<M; nth++) {
if (ci+Nthb0<M) {
Nthb=Nthb0;
} else {
Nthb=M-ci;
}
threaddata[nth].hst=&thst; /* for load balancing */
threaddata[nth].tid=nth;
threaddata[nth].u=u;
threaddata[nth].v=v;
threaddata[nth].w=w;
threaddata[nth].coh=0;
threaddata[nth].x=&xlocal[nth*Nbase*8*tilesz*Nchan]; /* distinct arrays to get back the result */
threaddata[nth].N=N;
threaddata[nth].Nbase=Nbase*tilesz; /* total baselines: actually Nbase x tilesz */
threaddata[nth].barr=barr;
threaddata[nth].carr=carr;
threaddata[nth].M=M;
threaddata[nth].Nf=Nchan;
threaddata[nth].freqs=freqs;
threaddata[nth].fdelta=fdelta;
threaddata[nth].tdelta=tdelta;
threaddata[nth].dec0=dec0;
threaddata[nth].dobeam=dobeam;
threaddata[nth].ph_ra0=ph_ra0;
threaddata[nth].ph_dec0=ph_dec0;
threaddata[nth].ph_freq0=ph_freq0;
threaddata[nth].longitude=longitude;
threaddata[nth].latitude=latitude;
threaddata[nth].time_utc=time_utc;
threaddata[nth].tilesz=tilesz;
threaddata[nth].Nelem=Nelem;
threaddata[nth].xx=xx;
threaddata[nth].yy=yy;
threaddata[nth].zz=zz;
/* parameters */
threaddata[nth].p=p;
threaddata[nth].Ns=Nthb;
threaddata[nth].soff=ci;
pthread_create(&th_array[nth],&attr,residual_threadfn,(void*)(&threaddata[nth]));
/* next source set */
ci=ci+Nthb;
}
/* now wait for threads to finish */
for(ci=0; ci<nth; ci++) {
pthread_join(th_array[ci],NULL);
/* subtract to find residual */
my_daxpy(Nbase*8*tilesz*Nchan,threaddata[ci].x,-1.0,x);
}
free(xlocal);
free(threaddata);
destroy_task_hist(&thst);
free(th_array);
pthread_attr_destroy(&attr);
return 0;
}
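/* What the join loop above computes, assuming my_daxpy() is the usual
   BLAS-style update y <- y + a*x: the observed data x has each thread's
   model prediction subtracted, i.e. x <- x - sum over threads of model.
   A scalar sketch (simple_daxpy is a hypothetical stand-in, not part of
   this file): */
static void
simple_daxpy(int n, double *x, double a, double *y) {
 int i;
 for (i=0; i<n; i++) {
  y[i]+=a*x[i]; /* with a=-1.0 this subtracts the model from the data */
 }
}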

View File

@@ -2658,11 +2658,16 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int tileze, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt);
/* FIXME: add_to_data: no subtraction is done */
extern int
predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data);
extern int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only);
#endif /*!HAVE_CUDA */
@@ -2675,6 +2680,12 @@ predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double
#ifndef ARRAY_MAX_ELEM /* if using shared memory, max possible elements for a station */
#define ARRAY_MAX_ELEM 512
#endif
/* default GPU heap size (in MB) needed to calculate some shapelet models;
 if a model has n0>20 or so, try increasing this and recompiling
 (the default CUDA device heap is ~8MB) */
#ifndef GPU_HEAP_SIZE
#define GPU_HEAP_SIZE 20
#endif
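/* e.g., pass -DGPU_HEAP_SIZE=32 (hypothetical build-flag usage) on the
   compiler command line to give shapelet kernels a 32MB device heap
   without editing this header */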
extern void
@@ -2683,9 +2694,12 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
extern void
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam);
extern void
cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam);
extern void
cudakernel_convert_time(int T, double *time_utc);