diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8d2dcda
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+*/*.png
+*/*.out
+*/*.output
+*/*.solutions
+*/*.dot
+test/sm.ms/
+test/nvprof-resultaten/
+src/MS/sagecal
+test/analysis-*.txt
+test/extended_*.*
+
+*/*/*.a
+**/*/*.o
+*/*/*.swp
+*/*/*.swo
+*/*/*.out
+*/*/*.output
+
+*/*/*/*.o
+*/*/*/*.a
+*/*/*/*.swp
+*/*/*/*.swo
+*/*/*/*.out
+*/*/*/*.output
+
diff --git a/INSTALL b/INSTALL.md
similarity index 75%
rename from INSTALL
rename to INSTALL.md
index 41bfa99..6cba268 100644
--- a/INSTALL
+++ b/INSTALL.md
@@ -1,7 +1,8 @@
Fri 2 Dec 2016 23:07:19 CET
-SAGECal Installation
-====================
-0) Prerequsites:
+
+# SAGECal Installation
+
+## 1 Prerequisites:
- CASACORE http://casacore.googlecode.com/
- glib http://developer.gnome.org/glib
- BLAS/LAPACK
@@ -15,12 +16,13 @@ SAGECal Installation
-- Intel MKL and other libraries
- Get the source for SAGECal: git clone git://git.code.sf.net/p/sagecal/code sagecal-code
-1) The basic way to build is
+## 2 The basic way to build
1.a) go to ./src/lib and run make (which will create libsagecal.a)
1.b) go to ./src/MS and run make (which will create the executable)
-2) In ./src/lib and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are:
+## 3 Build settings
+In ./src/lib and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are:
- LAPACK: directory where LAPACK/OpenBLAS is installed
- GLIBI/GLIBL: include/lib files for glib
- CASA_LIBDIR/CASA_INCDIR/CASA_LIBS : casacore include/library location and files:
@@ -39,23 +41,23 @@ SAGECal Installation
-SAGECAL-MPI Installation
-========================
-0) Prerequsites:
+# SAGECAL-MPI Installation
+
+## 1 Prerequisites:
- Same as above
- MPI (e.g. OpenMPI)
-1) Build ./src/lib as above (using mpicc -DMPI_BUILD)
+## 2 Build ./src/lib as above (using mpicc -DMPI_BUILD)
-2) Build ./src/MPI using mpicc++
+## 3 Build ./src/MPI using mpic++
-BUILDSKY Installation
-=====================
-1) See INSTALL in ./src/buildsky
+## BUILDSKY Installation
+
+ - See INSTALL in ./src/buildsky
-RESTORE Installation
-=====================
-1) See INSTALL in ./src/restore
+## RESTORE Installation
+
+ - See INSTALL in ./src/restore
diff --git a/README b/README.md
similarity index 91%
rename from README
rename to README.md
index 1dbc075..9ba22a0 100644
--- a/README
+++ b/README.md
@@ -1,15 +1,17 @@
-SAGECAL
-=======
+# SAGECAL
+
+
Read INSTALL for installation. This file gives a brief guide to use SAGECal.
Warning: this file may be obsolete. Use sagecal -h to see up-to-date options.
-Step by Step Introduction:
-#######################################################################
+## Step by Step Introduction:
+
1a) Calibrate data in the standard way using BBS/CASA or anything else. Use NDPPP to average the data in your MS to a few channels (also average in time to about 10sec). Also flag any spikes in the data.
1b) For subtraction of the ATeam from raw data (CasA,CygA,...), no initial calibration is necessary. Just run sagecal on raw data, but it is better to scale the sky model to match the apparent flux of the sources that are being subtracted.
-#######################################################################
+
+
2) Sky Model:
3a) Make an image of your MS (using ExCon/casapy). Use Duchamp to create a mask for the image. Use buildsky to create a sky model (see the README file on the top level directory). Also create a proper cluster file.
@@ -39,15 +41,15 @@ e.g.
P1C1 0 12 42.996 85 43 21.514 0.030498 0 0 0 -5.713060 0 0 0 0 115039062.0
P5C1 1 18 5.864 85 58 39.755 0.041839 0 0 0 -6.672879 0 0 0 0 115039062.0
-# A Gaussian mjor,minor 0.1375,0.0917 deg diameter -> radius(rad), PA 43.4772 deg (-> rad)
-# Position Angle: "West from North (counter-clockwise)" (0 deg = North, 90 deg = West).
-# Note: PyBDSM and BBS use "North from East (counter-clockwise)" (0 deg = East, 90 deg = North).
+#A Gaussian major,minor 0.1375,0.0917 deg diameter -> radius(rad), PA 43.4772 deg (-> rad)
+#Position Angle: "West from North (counter-clockwise)" (0 deg = North, 90 deg = West).
+#Note: PyBDSM and BBS use "North from East (counter-clockwise)" (0 deg = East, 90 deg = North).
G0 5 34 31.75 22 00 52.86 100 0 0 0 0.00 0 0.0012 0.0008 -2.329615801 130.0e6
-# A Disk radius=0.041 deg
+#A Disk radius=0.041 deg
D01 23 23 25.67 58 48 58 80 0 0 0 0 0 0.000715 0.000715 0 130e6
-# A Ring radius=0.031 deg
+#A Ring radius=0.031 deg
R01 23 23 25.416 58 48 57 70 0 0 0 0 0 0.00052 0.00052 0 130e6
-# A shapelet ('S3C61MD.fits.modes' file must be in the current directory)
+#A shapelet ('S3C61MD.fits.modes' file must be in the current directory)
S3C61MD 2 22 49.796414 86 18 55.913266 0.135 0 0 0 -6.6 0 1 1 0.0 115000000.0
diff --git a/src/MS/Makefile b/src/MS/Makefile
index e2145cb..cd817ce 100644
--- a/src/MS/Makefile
+++ b/src/MS/Makefile
@@ -36,5 +36,4 @@ data.o:data.cpp data.h
sagecal:$(OBJECTS) ../lib/Radio/libsagecal.a ../lib/Dirac/libdirac.a
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(INCLUDES) $(GLIBI) $(LIBPATH) -o $@ $(OBJECTS) $(MY_LIBS) $(LAPACK) $(CASA_LIBS) $(GLIBL)
clean:
- rm *.o
-
+ rm *.o *.tmp *.fits *.swp *.swo *.output
diff --git a/src/MS/Makefile.gpu b/src/MS/Makefile.gpu
index 0f3875b..9efaddd 100644
--- a/src/MS/Makefile.gpu
+++ b/src/MS/Makefile.gpu
@@ -10,9 +10,9 @@ LAPACK_DIR=/cm/shared/package/openblas/0.2.17mt/lib
#LAPACK_DIR=/usr/lib/atlas/sse/
CUDAINC=-I/cm/shared/apps/cuda80/toolkit/8.0.44/include/
-CUDALIB=-L/cm/shared/apps/cuda80/toolkit/8.0.44/lib64/ -lcuda -lcudart
+CUDALIB=-lcublas -lcusolver -lcudadevrt
-CULALIB=-lcublas -lcusolver -lcudadevrt
+# CULALIB=-lcublas -lcusolver -lcudadevrt
# NVML
NVML_INC=/usr/include/nvidia/gdk/
NVML_LIB=-lnvidia-ml -L/usr/lib64/nvidia/
diff --git a/src/MS/main.cpp b/src/MS/main.cpp
index 75566c8..829bdbf 100644
--- a/src/MS/main.cpp
+++ b/src/MS/main.cpp
@@ -23,6 +23,9 @@
#include
#include
#include
+#include
+
+#include "cuda_profiler_api.h"
#include
#include
@@ -241,11 +244,7 @@ main(int argc, char **argv) {
Data::readMSlist(Data::MSlist,&msnames);
}
if (Data::TableName) {
- if (!doBeam) {
- Data::readAuxData(Data::TableName,&iodata);
- } else {
Data::readAuxData(Data::TableName,&iodata,&beam);
- }
cout<<"Only one MS"<more()) {
start_time = time(0);
if (iodata.Nms==1) {
- if (!doBeam) {
- Data::loadData(msitr[0]->table(),iodata,&iodata.fratio);
- } else {
Data::loadData(msitr[0]->table(),iodata,beam,&iodata.fratio);
- }
} else {
Data::loadDataList(msitr,iodata,&iodata.fratio);
}
@@ -489,45 +469,16 @@ main(int argc, char **argv) {
preset_flags_and_data(iodata.Nbase*iodata.tilesz,iodata.flag,barr,iodata.x,Data::Nt);
/* if data is being whitened, whiten x here,
no need for a copy because we use xo for residual calculation */
- if (Data::whiten) {
- whiten_data(iodata.Nbase*iodata.tilesz,iodata.x,iodata.u,iodata.v,iodata.freq0,Data::Nt);
- }
/* precess source locations (also beam pointing) from J2000 to JAPP if we do any beam predictions,
using first time slot as epoch */
- if (doBeam && !sources_precessed) {
-
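/* Precession note: source positions (and the beam pointing) are given in
   J2000, but beam prediction needs apparent (JAPP) coordinates, so the
   guarded call below rotates them once, using the timestamp at the middle
   of the tile (beam.time_utc[iodata.tilesz/2]) as the epoch; the
   sources_precessed flag prevents the rotation from being applied again
   on later tiles. */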
precess_source_locations(beam.time_utc[iodata.tilesz/2],carr,M,&beam.p_ra0,&beam.p_dec0,Data::Nt); - sources_precessed=1; - } + // sources_precessed=1; -#ifdef USE_MIC - double *mic_u,*mic_v,*mic_w,*mic_x; - mic_u=iodata.u; - mic_v=iodata.v; - mic_w=iodata.w; - mic_x=iodata.x; - int mic_Nbase=iodata.Nbase; - int mic_tilesz=iodata.tilesz; - int mic_N=iodata.N; - double mic_freq0=iodata.freq0; - double mic_deltaf=iodata.deltaf; - double mic_data_min_uvcut=Data::min_uvcut; - int mic_data_Nt=Data::Nt; - int mic_data_max_emiter=Data::max_emiter; - int mic_data_max_iter=Data::max_iter; - int mic_data_max_lbfgs=Data::max_lbfgs; - int mic_data_lbfgs_m=Data::lbfgs_m; - int mic_data_gpu_threads=Data::gpu_threads; - int mic_data_linsolv=Data::linsolv; - int mic_data_solver_mode=Data::solver_mode; - int mic_data_randomize=Data::randomize; - double mic_data_nulow=Data::nulow; - double mic_data_nuhigh=Data::nuhigh; -#endif /* FIXME: uvmin is not needed in calibration, because its taken care of by flags */ if (!Data::DoSim) { /****************** calibration **************************/ +<<<<<<< HEAD #ifndef HAVE_CUDA if (!doBeam) { precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt); @@ -685,15 +636,19 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea #endif } +======= +#ifdef HAVE_CUDA + precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut, + beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt); +#endif +#ifndef HAVE_CUDA + precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt); +#endif +>>>>>>> master /****************** end calibration **************************/ /****************** begin diagnostics ************************/ -#ifdef HAVE_CUDA - if (Data::DoDiag) { - calculate_diagnostics(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,Data::DoDiag,Data::Nt); - } -#endif /* HAVE_CUDA */ - /****************** end diagnostics **************************/ } else { +<<<<<<< HEAD /************ simulation only mode ***************************/ if (!solfile) { #ifndef HAVE_CUDA @@ -724,6 +679,23 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea } /************ end simulation only mode ***************************/ } +======= +#ifdef HAVE_CUDA + predict_visibilities_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0, + beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,(Data::DoSim>1?1:0)); +#endif +#ifndef HAVE_CUDA + precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut, + beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt); +#endif +} + +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); + 
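/* Profiling note: the cudaDeviceSynchronize() above drains every kernel
   and async copy still in flight, so the profiler-stop call below marks
   the true end of the profiled region. cudaProfilerStop() is declared in
   cuda_profiler_api.h, included at the top of this file, and pairs with a
   cudaProfilerStart() presumably issued earlier or with profiling started
   from the nvprof command line; the exit(0) that follows ends the run
   after the first tile, so exactly one tile's worth of work is captured. */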
cudaProfilerStop(); + exit(0); +#endif +>>>>>>> master tilex+=iodata.tilesz; /* print solutions to file */ @@ -740,33 +712,11 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea } /**********************************************************/ - /* also write back */ - if (iodata.Nms==1) { - Data::writeData(msitr[0]->table(),iodata); - } else { - Data::writeDataList(msitr,iodata); - } for(int cm=0; cmres_ratio*res_prev) { - cout<<"Resetting Solution"<Nt); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==0) { } else if (solve_axb==1) { @@ -739,7 +739,7 @@ mlm_der_single( #endif exit(1); } - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==1) { /* QR solver ********************************/ /* workspace query */ @@ -1221,7 +1221,7 @@ oslevmar_der_single_nocuda( me_data_t *dt=(me_data_t*)adata; setweights(M,aones,1.0,dt->Nt); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==0) { } else if (solve_axb==1) { diff --git a/src/lib/Dirac/consensus_poly.c b/src/lib/Dirac/consensus_poly.c index 938f4d5..d45d492 100644 --- a/src/lib/Dirac/consensus_poly.c +++ b/src/lib/Dirac/consensus_poly.c @@ -283,185 +283,6 @@ find_prod_inverse(double *B, double *Bi, int Npoly, int Nf, double *fratio) { -typedef struct thread_data_prod_inv_ { - int startM; - int endM; - int M; - int Nf; - int Npoly; - double *B; - double *Bi; - double *rho; -} thread_data_prod_inv_t; - - -/* worker thread function for calculating sum and inverse */ -static void* -sum_inv_threadfn(void *data) { - thread_data_prod_inv_t *t=(thread_data_prod_inv_t*)data; - double w[1],*WORK,*U,*S,*VT; - - int k,ci,status,lwork=0; - int Np2=t->Npoly*t->Npoly; - /* allocate memory for the SVD here */ - if ((U=(double*)calloc((size_t)Np2,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((VT=(double*)calloc((size_t)Np2,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((S=(double*)calloc((size_t)t->Npoly,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - - /* memory for SVD: use first location of Bi */ - status=my_dgesvd('A','A',t->Npoly,t->Npoly,&(t->Bi[t->startM*Np2]),t->Npoly,S,U,t->Npoly,VT,t->Npoly,w,-1); - if (!status) { - lwork=(int)w[0]; - } else { - printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status); - exit(1); - } - if ((WORK=(double*)calloc((size_t)lwork,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - - - - /* iterate over clusters */ - for (k=t->startM; k<=t->endM; k++) { - memset(&(t->Bi[k*Np2]),0,sizeof(double)*Np2); - /* find sum */ - for (ci=0; ciNf; ci++) { - /* outer product */ - my_dgemm('N','T',t->Npoly,t->Npoly,1,t->rho[k+ci*t->M],&t->B[ci*t->Npoly],t->Npoly,&t->B[ci*t->Npoly],t->Npoly,1.0,&(t->Bi[k*Np2]),t->Npoly); - } - /* find SVD */ - status=my_dgesvd('A','A',t->Npoly,t->Npoly,&(t->Bi[k*Np2]),t->Npoly,S,U,t->Npoly,VT,t->Npoly,WORK,lwork); - if (status) { - printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status); - exit(1); - } - - /* find 1/singular values, and multiply columns of U with new singular values */ - for (ci=0; ciNpoly; ci++) { - if (S[ci]>CLM_EPSILON) { - S[ci]=1.0/S[ci]; - } else { - S[ci]=0.0; - } - my_dscal(t->Npoly,S[ci],&U[ci*t->Npoly]); - } - - /* find product U 1/S V^T */ - 
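/* Pseudoinverse note: each Bi block accumulated above is a weighted sum of
   outer products sum_f rho(k,f) * b_f b_f^T and is therefore symmetric
   positive semi-definite. For such a matrix the SVD A = U S V^T coincides
   with the eigendecomposition (V equals U on the nonzero singular
   subspace), so the Moore-Penrose pseudoinverse V S^+ U^T is obtained by
   the dgemm below as (U S^+) V^T: the columns of U were already scaled by
   the inverted singular values, and singular values below CLM_EPSILON were
   zeroed rather than inverted, which is what makes this a pseudoinverse
   and keeps ill-conditioned clusters from blowing up. */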
my_dgemm('N','N',t->Npoly,t->Npoly,t->Npoly,1.0,U,t->Npoly,VT,t->Npoly,0.0,&(t->Bi[k*Np2]),t->Npoly); - - } - - free(U); - free(VT); - free(S); - free(WORK); - return NULL; -} - -/* build matrix with polynomial terms - B : Npoly x Nf, each row is one basis function - Bi: Npoly x Npoly pseudo inverse of sum( B(:,col) x B(:,col)' ) : M times - Npoly : total basis functions - Nf: frequencies - M: clusters - rho: NfxM array of regularization factors (for each freq, M values) - Sum taken is a weighted sum, using weights in rho, rho is assumed to change for each freq,cluster pair - - Nt: no. of threads -*/ -int -find_prod_inverse_full(double *B, double *Bi, int Npoly, int Nf, int M, double *rho, int Nt) { - - pthread_attr_t attr; - pthread_t *th_array; - thread_data_prod_inv_t *threaddata; - - int ci,Nthb0,Nthb,nth,nth1; - /* clusters per thread */ - Nthb0=(M+Nt-1)/Nt; - - /* setup threads */ - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE); - - if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((threaddata=(thread_data_prod_inv_t*)malloc((size_t)Nt*sizeof(thread_data_prod_inv_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - - - ci=0; - for (nth=0; nth Q - Bi : NpolyxNpoly matrix = B^T - - for each direction (M values) - select 2N,2N,... : 2Nx Npoly complex values from z (ordered by M) - select real,imag: size 2NxNpoly, 2NxNpoly vectors - reshape to 2NxNpoly => R - reshape to 2NxNpoly => I (imag) - - then Q=([R I] Bi^T) for each column - Q=[R_1^T I_1^T R_2^T I_2^T]^T Bi^T for 2 columns - R_1,I_1,R_2,I_2 : size 2NxNpoly - R : (2N 4) x Npoly - so find Q - */ - double *R,*Q; - if ((R=(double*)calloc((size_t)2*t->N*t->Npoly*4,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((Q=(double*)calloc((size_t)2*t->N*t->Npoly*4,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - - int ci,np; - for (ci=t->startM; ci<=t->endM; ci++) { - for (np=0; npNpoly; np++) { - /* select 2N */ - my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M], 4, &R[np*8*t->N], 1); /* R_1 */ - my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+1], 4, &R[np*8*t->N+2*t->N], 1); /* I_1 */ - my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+2], 4, &R[np*8*t->N+2*2*t->N], 1); /* R_2 */ - my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+3], 4, &R[np*8*t->N+3*2*t->N], 1); /* I_2 */ - } - /* find Q=R B^T */ - memset(Q,0,sizeof(double)*2*t->N*t->Npoly*4); - my_dgemm('N','N',8*t->N,t->Npoly,t->Npoly,1.0,R,8*t->N,&t->Bi[ci*t->Npoly*t->Npoly],t->Npoly,1.0,Q,8*t->N); /* copy back to Z */ - for (np=0; npNpoly; np++) { - my_dcopy(2*t->N, &Q[np*8*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np], 4); - my_dcopy(2*t->N, &Q[np*8*t->N+2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+1], 4); - my_dcopy(2*t->N, &Q[np*8*t->N+2*2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+2], 4); - my_dcopy(2*t->N, &Q[np*8*t->N+3*2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+3], 4); + for (np=0; npoffset; - for (ci=t->startM; ci<=t->endM; ci++) { - ip12=my_ddot(8*t->N*t->carr[ci].nchunk,&t->deltaY[ck],&t->deltaJ[ck]); /* x^T y */ - /* further computations are only required if there is +ve correlation */ - if (ip12>CLM_EPSILON) { - /* find the inner products */ - ip11=my_dnrm2(8*t->N*t->carr[ci].nchunk,&t->deltaY[ck]); /* || ||_2 */ - ip22=my_dnrm2(8*t->N*t->carr[ci].nchunk,&t->deltaJ[ck]); /* || ||_2 */ - /* 
square the norm to get dot prod */ - ip11*=ip11; - ip22*=ip22; - /* only try to do an update if the 'delta's are finite, also - there is tangible correlation between the two deltas */ -#ifdef DEBUG - printf("%d ip11=%lf ip12=%lf ip22=%lf\n",ci,ip11,ip12,ip22); -#endif - if (ip11>CLM_EPSILON && ip22>CLM_EPSILON) { - double alphacorr=ip12/sqrt(ip11*ip22); - /* decide on whether to do further calculations only if there is sufficient correlation - between the deltas */ - if (alphacorr>alphacorrmin) { - double alphaSD=ip11/ip12; - double alphaMG=ip12/ip22; - double alphahat; - if (2.0*alphaMG>alphaSD) { - alphahat=alphaMG; - } else { - alphahat=alphaSD-alphaMG*0.5; - } -#ifdef DEBUG - printf("alphacorr=%lf alphaSD=%lf alphaMG=%lf alphahat=%lf rho=%lf\n",alphacorr,alphaSD,alphaMG,alphahat,t->rho[ci]); -#endif - /* decide on whether to update rho based on heuristics */ - if (alphahat> 0.001 && alphahatrhoupper[ci]) { -#ifdef DEBUG - printf("updating rho from %lf to %lf\n",t->rho[ci],alphahat); -#endif - t->rho[ci]=alphahat; - } - } - } - - } - ck+=t->N*8*t->carr[ci].nchunk; - } - return NULL; -} - - -/* Barzilai & Borwein update of rho [Xu et al] */ -/* rho: Mx1 values, to be updated - rhoupper: Mx1 values, upper limit of rho - N: no of stations - M : clusters - Mt: actual clusters (include hybrid parameter) Mt >= M - carr: cluster info array, to get hybrid parameters: Mx1 - Yhat: current Yhat : 8*N*Mt - Yhat_k0 : old Yhat at previous update of rho : 8*N*Mt - J: current solution : 8*N*Mt - J_k0: old solution at previous update of rho : 8*N*Mt - Nt: no. of threads -*/ -int -update_rho_bb(double *rho, double *rhoupper, int N, int M, int Mt, clus_source_t *carr, double *Yhat, double *Yhat_k0, double *J, double *J_k0, int Nt) { - - double *deltaY; /* Yhat - Yhat_k0 */ - double *deltaJ; /* J - J_k0 (with J_k0 projected to tangent plane of J)*/ - if ((deltaY=(double*)calloc((size_t)8*N*Mt,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((deltaJ=(double*)calloc((size_t)8*N*Mt,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - - - my_dcopy(8*N*Mt, Yhat, 1, deltaY, 1); - my_daxpy(8*N*Mt, Yhat_k0, -1.0, deltaY); -//no need to remove unitary ambiguity from J-Jold - my_dcopy(8*N*Mt, J, 1, deltaJ, 1); - my_daxpy(8*N*Mt, J_k0,-1.0, deltaJ); - - pthread_attr_t attr; - pthread_t *th_array; - thread_data_rho_bb_t *threaddata; - - int ci,cj,ck,Nthb0,Nthb,nth,nth1; - /* clusters per thread */ - Nthb0=(M+Nt-1)/Nt; - - /* setup threads */ - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE); - - if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((threaddata=(thread_data_rho_bb_t*)malloc((size_t)Nt*sizeof(thread_data_rho_bb_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - - - ci=0; - ck=0; - for (nth=0; nth - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
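/* Aside on the Barzilai-Borwein code removed above: rho_bb_threadfn and
   update_rho_bb implement the spectral (BB) step-size rule of Xu et al.
   for the per-cluster ADMM penalty rho. With dY = Yhat - Yhat_k0 and
   dJ = J - J_k0, the worker forms ip11 = <dY,dY>, ip12 = <dY,dJ> and
   ip22 = <dJ,dJ>, giving the steepest-descent step alphaSD = ip11/ip12
   and the minimum-gradient step alphaMG = ip12/ip22; the hybrid rule takes
   alphaMG when 2*alphaMG > alphaSD and alphaSD - alphaMG/2 otherwise, and
   rho[ci] is replaced by the result only when the correlation
   ip12/sqrt(ip11*ip22) is high enough and the new value stays below
   rhoupper[ci]. */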
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - $Id$ - */ - - -#include -#include -#include -#include -#include -#include -#include - -#include "Dirac.h" - -int -open_data_stream(int file, double **d, int *count, int *N, double *freq0, double *ra0, double *dec0) { - struct stat statbuf; - - int ig; - - - /* find the file size */ - if (fstat (file,&statbuf) < 0) { - fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__); - exit(1); - } - - //printf("file size (bytes) %d\n",(int)statbuf.st_size); - /* total double values is size/8 */ - *count=statbuf.st_size/8; - //printf("total double values %d\n",*count); - - /* map the file to memory */ - *d= (double*)mmap(NULL, statbuf.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, file, 0); - if ( !d) { - fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__); - exit(1); - } - - /* remove header from data */ - *N=(int)(*d)[0]; - *freq0=(*d)[1]; - *ra0=(*d)[2]; - *dec0=(*d)[3]; - /* read ignored stations and discard them */ - ig=(int)(*d)[4]; - /* make correct value for N */ - *N=*N-ig; - - printf("Ignoring %d stations\n",ig); - /* increment to data */ - *d=&((*d)[5+ig]); - - return(0); -} - - -int -close_data_stream(double *d, int count) { - - /* sync to disk */ - msync(d, (size_t)count*sizeof(double), MS_SYNC ); - munmap((void*)d, (size_t)count*sizeof(double)); - return 0; -} - diff --git a/src/lib/Dirac/lbfgs.c b/src/lib/Dirac/lbfgs.c index 221e6a7..776101d 100644 --- a/src/lib/Dirac/lbfgs.c +++ b/src/lib/Dirac/lbfgs.c @@ -102,7 +102,7 @@ pipeline_slave_code_b(void *data) checkCudaError(err,__FILE__,__LINE__); } else if (dp->status[tid]==PT_DO_CCOST) { /* divide total baselines by 2 */ - int BlocksPerGrid=(dp->Nbase[tid]+dp->lmdata[tid]->ThreadsPerBlock-1)/dp->lmdata[tid]->ThreadsPerBlock; + int BlocksPerGrid= 2*(dp->Nbase[tid]+dp->lmdata[tid]->ThreadsPerBlock-1)/dp->lmdata[tid]->ThreadsPerBlock; int boff=dp->boff[tid]; /* copy the current solution to device */ err=cudaMemcpy(dp->cpp[tid], dp->lmdata[tid]->p, m*sizeof(double), cudaMemcpyHostToDevice); @@ -901,12 +901,12 @@ lbfgs_fit_common( /* parameters per thread (GPU) */ int Nparm=(m+2-1)/2; /* find number of blocks */ - int BlocksPerGrid = (Nparm+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid = 2* (Nparm+ThreadsPerBlock-1)/ThreadsPerBlock; ci=0; int nth; for (nth=0; nth<2; nth++) { threaddata[nth].ThreadsPerBlock=ThreadsPerBlock; - threaddata[nth].BlocksPerGrid=BlocksPerGrid; + threaddata[nth].BlocksPerGrid= 2*BlocksPerGrid; threaddata[nth].card=nth; threaddata[nth].Nbase=dp->Nbase; threaddata[nth].tilesz=dp->tilesz; diff --git a/src/lib/Dirac/myblas.c b/src/lib/Dirac/myblas.c index cbf1d75..35d237a 100644 --- a/src/lib/Dirac/myblas.c +++ b/src/lib/Dirac/myblas.c @@ -206,7 +206,7 @@ __attribute__ ((target(MIC))) } -/* following routines used in LAPACK solvers */ +/* following routines used in LAPACK dirac */ /* cholesky factorization: real symmetric */ int my_dpotrf(char uplo, int N, double *A, int lda) { diff --git a/src/lib/Dirac/oslmfit.c b/src/lib/Dirac/oslmfit.c index 82be194..f19f447 100644 --- a/src/lib/Dirac/oslmfit.c +++ b/src/lib/Dirac/oslmfit.c @@ -157,7 +157,7 @@ oslevmar_der_single_cuda( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; unsigned long int moff; 
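This commit repeatedly doubles BlocksPerGrid from the usual ceil(M/ThreadsPerBlock). Launching more blocks than strictly needed is harmless only because the kernels involved bounds-check their global thread index (the extra threads fail the n < M style guards and idle); the more conventional way to decouple grid size from problem size is a grid-stride loop. A minimal sketch of both, assuming a simple element-wise kernel (the kernel and variable names are illustrative, not from this codebase):

#include <cstdio>
#include <cuda_runtime.h>

/* Illustrative kernel: any grid size works because each thread strides
 * over the whole index range instead of assuming one element per thread. */
__global__ void scale_kernel(int M, double a, double *x) {
  for (unsigned int n = threadIdx.x + blockDim.x * blockIdx.x;
       n < (unsigned int)M;
       n += blockDim.x * gridDim.x) {
    x[n] *= a;
  }
}

int main(void) {
  const int M = 1000, ThreadsPerBlock = 128;
  /* ceil(M/ThreadsPerBlock); the extra factor 2 used in this commit only
   * adds blocks whose threads fail the n < M test above. */
  int BlocksPerGrid = 2 * (M + ThreadsPerBlock - 1) / ThreadsPerBlock;
  double *xd;
  cudaMalloc((void **)&xd, M * sizeof(double));
  cudaMemset(xd, 0, M * sizeof(double));
  scale_kernel<<<BlocksPerGrid, ThreadsPerBlock>>>(M, 2.0, xd);
  cudaDeviceSynchronize();
  printf("last CUDA error: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaFree(xd);
  return 0;
}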
@@ -190,7 +190,7 @@ oslevmar_der_single_cuda( checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**)&ed, N*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==1) { err=cudaMalloc((void**)&taud, M*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); diff --git a/src/lib/Dirac/robustlm.c b/src/lib/Dirac/robustlm.c index 07b0f6e..05e21d8 100644 --- a/src/lib/Dirac/robustlm.c +++ b/src/lib/Dirac/robustlm.c @@ -159,7 +159,7 @@ rlevmar_der_single_cuda( int ThreadsPerBlock=DEFAULT_TH_PER_BK; int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* DEFAULT_TH_PER_BK/8 for accessing each element of a baseline */ int ThreadsPerBlock2=Nd/2; /* for evaluating nu */ - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; unsigned long int moff; @@ -196,7 +196,7 @@ rlevmar_der_single_cuda( checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**)&ed, N*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==1) { err=cudaMalloc((void**)&taud, M*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); @@ -805,7 +805,7 @@ rlevmar_der_single_cuda_fl( /* FIXME: might need a large value for large no of baselines */ int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* for accessing each element of a baseline */ int ThreadsPerBlock2=Nd/2; /* for evaluating nu */ - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; unsigned long int moff; @@ -1382,7 +1382,7 @@ osrlevmar_der_single_cuda_fl( /* FIXME: might need a large value for large no of baselines */ int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* for accessing each element of a baseline */ int ThreadsPerBlock2=Nd/2; /* for evaluating nu */ - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; unsigned long int moff; @@ -2180,7 +2180,7 @@ rlevmar_der_single_nocuda( setweights(M,aones,1.0,lmdata->Nt); /*W set initial weights to 1 */ setweights(N,wtd,1.0,lmdata->Nt); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==0) { } else if (solve_axb==1) { @@ -2766,7 +2766,7 @@ osrlevmar_der_single_nocuda( /*W set initial weights to 1 */ setweights(N,wtd,1.0,lmdata0->Nt); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==0) { } else if (solve_axb==1) { diff --git a/src/lib/Dirac/rtr_solve_cuda.c b/src/lib/Dirac/rtr_solve_cuda.c index 2d9b912..ed4ae25 100644 --- a/src/lib/Dirac/rtr_solve_cuda.c +++ b/src/lib/Dirac/rtr_solve_cuda.c @@ -582,7 +582,7 @@ rtr_solve_cuda_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x to make J: 2Nx2 complex double diff --git a/src/lib/Dirac/rtr_solve_robust_cuda.c b/src/lib/Dirac/rtr_solve_robust_cuda.c index 45d62f1..163bf29 100644 --- a/src/lib/Dirac/rtr_solve_robust_cuda.c +++ b/src/lib/Dirac/rtr_solve_robust_cuda.c @@ -602,7 +602,7 @@ rtr_solve_cuda_robust_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x 
to make J: 2Nx2 complex double @@ -985,7 +985,7 @@ nsd_solve_cuda_robust_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x to make J: 2Nx2 complex double diff --git a/src/lib/Dirac/rtr_solve_robust_cuda_admm.c b/src/lib/Dirac/rtr_solve_robust_cuda_admm.c index 7451c6b..d77c735 100644 --- a/src/lib/Dirac/rtr_solve_robust_cuda_admm.c +++ b/src/lib/Dirac/rtr_solve_robust_cuda_admm.c @@ -517,7 +517,7 @@ rtr_solve_cuda_robust_admm_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x to make J: 2Nx2 complex double @@ -947,7 +947,7 @@ nsd_solve_cuda_robust_admm_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x to make J: 2Nx2 complex double diff --git a/src/lib/Radio/Makefile b/src/lib/Radio/Makefile index 43bb3be..49bcd1c 100644 --- a/src/lib/Radio/Makefile +++ b/src/lib/Radio/Makefile @@ -2,11 +2,12 @@ CC=gcc CXX=g++ #CFLAGS= -Wall -O3 -g #-pg CFLAGS= -Wall -O3 -fopt-info-optimized +# CFLAGS= -Wall -pg -O2 -ansi -fPIC -fpermissive -fno-omit-frame-pointer -DNDEBUG -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls CLIBS= -lm -lpthread #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran -LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread - +#LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread +LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread INCLUDES= -I. 
-I../Dirac/ LIBPATH= diff --git a/src/lib/Radio/Makefile.gpu b/src/lib/Radio/Makefile.gpu index d9e2d7c..eaead10 100644 --- a/src/lib/Radio/Makefile.gpu +++ b/src/lib/Radio/Makefile.gpu @@ -3,7 +3,8 @@ CXX=g++ NVCC=nvcc CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE -pg CLIBS= -lm -lpthread -LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread +#LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread +LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread # LAPACK=-lblas -lgfortran -lpthread CUDAINC=/usr/local/cuda/include diff --git a/src/lib/Radio/predict_model.cu b/src/lib/Radio/predict_model.cu index 64b0a44..6678c34 100644 --- a/src/lib/Radio/predict_model.cu +++ b/src/lib/Radio/predict_model.cu @@ -26,25 +26,6 @@ /* enable this for checking for kernel failure */ //#define CUDA_DBG -/* matrix multiplications */ -/* C=A*B */ -__device__ void -amb(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) { - c[0]=cuCadd(cuCmul(a[0],b[0]),cuCmul(a[1],b[2])); - c[1]=cuCadd(cuCmul(a[0],b[1]),cuCmul(a[1],b[3])); - c[2]=cuCadd(cuCmul(a[2],b[0]),cuCmul(a[3],b[2])); - c[3]=cuCadd(cuCmul(a[2],b[1]),cuCmul(a[3],b[3])); -} -/* C=A*B^H */ -__device__ void -ambt(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) { - c[0]=cuCadd(cuCmul(a[0],cuConj(b[0])),cuCmul(a[1],cuConj(b[1]))); - c[1]=cuCadd(cuCmul(a[0],cuConj(b[2])),cuCmul(a[1],cuConj(b[3]))); - c[2]=cuCadd(cuCmul(a[2],cuConj(b[0])),cuCmul(a[3],cuConj(b[1]))); - c[3]=cuCadd(cuCmul(a[2],cuConj(b[2])),cuCmul(a[3],cuConj(b[3]))); -} - - __device__ void radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float thetaGMST, float *az, float *el) { float thetaLST=thetaGMST+longitude*180.0f/M_PI; @@ -70,49 +51,134 @@ radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float th *az=azd; } -/* use compiler directives to use/not use shared memory */ -/* ARRAY_MAX_ELEM : define this in header file beforehand, if using shared memory */ -/* master kernel to calculate beam */ +/* slave kernel to calculate phase of manifold vector for given station */ +/* x,y,z: Nx1 arrays of element coords */ +/* sum: scalar to store result */ +/* NOTE: only 1 block should be used here */ __global__ void -kernel_array_beam(int N, int T, int K, int F, - const double *__restrict__ freqs, const float *__restrict__ longitude, const float *__restrict__ latitude, - const double *__restrict__ time_utc, const int *__restrict__ Nelem, - const float * const *__restrict__ xx, const float * const *__restrict__ yy, const float * const *__restrict__ zz, - const float *__restrict__ ra, const float *__restrict__ dec, - float ph_ra0, float ph_dec0, float ph_freq0, float *beam) { +kernel_array_beam_slave_sin(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) { + unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; + extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */ + if (n 1) { + int halfPoint = (nTotalThreads >> 1); // divide by two + if (n < halfPoint) { + int thread2 = n + halfPoint; + if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... 
blockDim_2-1 ) + tmpsum[n] = tmpsum[n]+tmpsum[thread2]; + } + } + __syncthreads(); + nTotalThreads = halfPoint; // Reducing the binary tree size by two + } - // find respective source,freq,time for this thread - int n1 = x; - int isrc=n1/(T*F); - n1=n1-isrc*(T*F); - int ifrq=n1/(T); - n1=n1-ifrq*(T); - int itm=n1; + /* now thread 0 will add up results */ + if (threadIdx.x==0) { + *sum=tmpsum[0]; + } +} - //number of elements for this station - int Nelems = __ldg(&Nelem[istat]); +__global__ void +kernel_array_beam_slave_cos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) { + unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; + extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */ + if (n 1) { + int halfPoint = (nTotalThreads >> 1); // divide by two + if (n < halfPoint) { + int thread2 = n + halfPoint; + if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 ) + tmpsum[n] = tmpsum[n]+tmpsum[thread2]; + } + } + __syncthreads(); + nTotalThreads = halfPoint; // Reducing the binary tree size by two + } - //using shared memory - #if (ARRAY_USE_SHMEM==1) - __shared__ float sh_x[ARRAY_MAX_ELEM]; - __shared__ float sh_y[ARRAY_MAX_ELEM]; - __shared__ float sh_z[ARRAY_MAX_ELEM]; - for (int i=threadIdx.x; i 1) { + int halfPoint = (nTotalThreads >> 1); // divide by two + if (n < halfPoint) { + int thread2 = n + halfPoint; + if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 ) + tmpsum[2*n] = tmpsum[2*n]+tmpsum[2*thread2]; + tmpsum[2*n+1] = tmpsum[2*n+1]+tmpsum[2*thread2+1]; + } + } + __syncthreads(); + nTotalThreads = halfPoint; // Reducing the binary tree size by two + } + + /* now thread 0 will add up results */ + if (threadIdx.x==0) { + sum[0]=tmpsum[0]; + sum[1]=tmpsum[1]; + } +} + + +__device__ int +NearestPowerOf2 (int n){ + if (!n) return n; //(0 == 2^0) + + int x = 1; + while(x < n) { + x <<= 1; + } + return x; +} + + +/* master kernel to calculate beam */ +/* tarr: size NTKFx2 buffer to store sin() cos() sums */ +__global__ void +kernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude, float *latitude, + double *time_utc, int *Nelem, float **xx, float **yy, float **zz, float *ra, float *dec, float ph_ra0, float ph_dec0, float ph_freq0, float *beam, float *tarr) { + + /* global thread index */ + unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; + /* allowed max threads */ + int Ntotal=N*T*K*F; + if (n>>(Nelems,r1,r2,r3,xx[istat],yy[istat],zz[istat],&tarr[2*n],NearestPowerOf2(Nelems)); + cudaDeviceSynchronize(); +#ifdef CUDA_DBG + error = cudaGetLastError(); + if(error != cudaSuccess) { + // print the CUDA error message and exit + printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); + } +#endif + float ssum=__ldg(&tarr[2*n]); + float csum=__ldg(&tarr[2*n+1]); + float Nnor=1.0f/(float)Nelems; ssum*=Nnor; csum*=Nnor; /* store output (amplitude of beam)*/ - int boffset=itm*N*K*F+isrc*N*F+ifrq*N+istat; beam[boffset]=sqrtf(ssum*ssum+csum*csum); //printf("thread %d stat %d src %d freq %d time %d : %lf longitude=%lf latitude=%lf time=%lf freq=%lf elem=%d ra=%lf dec=%lf beam=%lf\n",n,istat,isrc,ifrq,itm,time_utc[itm],longitude[istat],latitude[istat],time_utc[itm],freqs[ifrq],Nelem[istat],ra[isrc],dec[isrc],beam[boffset]); } @@ -175,8 +247,8 @@ kernel_array_beam(int N, int T, int K, int F, } /***************************************************************************/ -__device__ cuDoubleComplex -gaussian_contrib__(int 
*dd, float u, float v, float w) { +__device__ cuFloatComplex +gaussian_contrib(int *dd, float u, float v, float w) { exinfo_gaussian *dp=(exinfo_gaussian*)dd; float up,vp,a,b,ut,vt,cosph,sinph; @@ -196,13 +268,13 @@ gaussian_contrib__(int *dd, float u, float v, float w) { ut=a*(cosph*up-sinph*vp); vt=b*(sinph*up+cosph*vp); - return make_cuDoubleComplex((double)(0.5f*M_PI*expf(-(ut*ut+vt*vt))),0.0); + return make_cuFloatComplex(0.5f*M_PI*expf(-(ut*ut+vt*vt)),0.0f); } -__device__ cuDoubleComplex -ring_contrib__(int *dd, float u, float v, float w) { +__device__ cuFloatComplex +ring_contrib(int *dd, float u, float v, float w) { exinfo_ring *dp=(exinfo_ring*)dd; float up,vp,a,b; @@ -213,11 +285,11 @@ ring_contrib__(int *dd, float u, float v, float w) { a=dp->eX; /* diameter */ b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI; - return make_cuDoubleComplex((double)j0f(b),0.0); + return make_cuFloatComplex(j0f(b),0.0f); } -__device__ cuDoubleComplex -disk_contrib__(int *dd, float u, float v, float w) { +__device__ cuFloatComplex +disk_contrib(int *dd, float u, float v, float w) { exinfo_disk *dp=(exinfo_disk*)dd; float up,vp,a,b; @@ -228,7 +300,7 @@ disk_contrib__(int *dd, float u, float v, float w) { a=dp->eX; /* diameter */ b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI; - return make_cuDoubleComplex((double)j1f(b),0.0); + return make_cuFloatComplex(j1f(b),0.0f); } @@ -321,8 +393,8 @@ calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av free(shpvl); } -__device__ cuDoubleComplex -shapelet_contrib__(int *dd, float u, float v, float w) { +__device__ cuFloatComplex +shapelet_contrib(int *dd, float u, float v, float w) { exinfo_shapelet *dp=(exinfo_shapelet*)dd; int *cplx; float *Av; @@ -346,11 +418,6 @@ shapelet_contrib__(int *dd, float u, float v, float w) { __sincosf((float)dp->eP,&sinph,&cosph); ut=a*(cosph*up-sinph*vp); vt=b*(sinph*up+cosph*vp); - /* if u,v is way off the scale (beta) of shapelet modes, the result is almost always zero, - so check this here and return 0, otherwise spurious nans may result */ - if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))beta) { - return make_cuDoubleComplex(0.0,0.0); - } /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v) so negate the u grid */ Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float)); @@ -369,198 +436,163 @@ shapelet_contrib__(int *dd, float u, float v, float w) { free(Av); free(cplx); + //return 2.0*M_PI*(realsum+_Complex_I*imagsum); realsum*=2.0f*M_PI*a*b; imagsum*=2.0f*M_PI*a*b; - /* additional safeguards */ - if ( isnan(realsum) ) { realsum=0.0f; } - if ( isnan(imagsum) ) { imagsum=0.0f; } - return make_cuDoubleComplex((double)realsum,(double)imagsum); + return make_cuFloatComplex(realsum,imagsum); } -__device__ void -compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F, -double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0, - double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) { - /* F>1 is assumed output: 8x1 array */ - double sinph,cosph; - sincos(phterm0*myfreq,&sinph,&cosph); - cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph); - double If,Qf,Uf,Vf; - If=Qf=Uf=Vf=0.0; - /* evaluate spectra */ - double fratio=log(myfreq/myf0); - double fratio1=fratio*fratio; - double fratio2=fratio1*fratio; - double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2; - /* catch -ve 
flux */ - if (sI0f>0.0) { - If=exp(log(sI0f)+cm); - } else if (sI0f<0.0) { - If=-exp(log(-sI0f)+cm); - } - if (sQ0f>0.0) { - Qf=exp(log(sQ0f)+cm); - } else if (sI0f<0.0) { - Qf=-exp(log(-sQ0f)+cm); - } - if (sU0f>0.0) { - Uf=exp(log(sU0f)+cm); - } else if (sI0f<0.0) { - Uf=-exp(log(-sU0f)+cm); - } - if (sV0f>0.0) { - Vf=exp(log(sV0f)+cm); - } else if (sI0f<0.0) { - Vf=-exp(log(-sV0f)+cm); - } - /* smearing, beam */ - double scalef=1.0; - double phterm =(phterm0*0.5*deltaf); - if (phterm!=0.0) { - sinph=(sin(phterm)/phterm); - scalef *=fabs(sinph); /* catch -ve values due to rounding off */ + +/* slave thread to calculate coherencies, for 1 source */ +/* baseline (sta1,sta2) at time itm */ +/* K: total sources, uset to find right offset + Kused: actual sources calculated in this thread block + Koff: offset in source array to start calculation + NOTE: only 1 block is used + */ +__global__ void +kernel_coherencies_slave(int sta1, int sta2, int itm, int B, int N, int T, int K, int Kused, int Koff, int F, float u, float v, float w, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI, + unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *__restrict__ coh,int dobeam,int blockDim_2) { + /* which source we work on */ + unsigned int k=threadIdx.x+blockDim.x*blockIdx.x; + + extern __shared__ float tmpcoh[]; /* assumed to be size 8*F*Kusedx1 */ + + if (k1) { + sI0f=__ldg(&sI0[k]); + spec_idxf=__ldg(&spec_idx[k]); + spec_idx1f=__ldg(&spec_idx1[k]); + spec_idx2f=__ldg(&spec_idx2[k]); + myf0=__ldg(&f0[k]); + } + unsigned char stypeT=__ldg(&stype[k]); + for(int cf=0; cf0.0f) { + If=__expf(__logf(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); + } else if (sI0f<0.0f) { + If=-__expf(__logf(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); + } else { + If=0.0f; + } + } + /* smearing */ + float phterm =phterm0*0.5f*deltaf; + if (phterm!=0.0f) { + sinph=__sinf(phterm)/phterm; + If *=fabsf(sinph); /* catch -ve values due to rounding off */ } if (dobeam) { /* get beam info */ //int boffset1=sta1*K*T*F + k1*T*F + cf*T + itm; - - int boffset1=itm*N*K*F+k*N*F+cf*N+sta1; - // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2); + int boffset1=itm*N*K*F+k1*N*F+cf*N+sta1; float beam1=__ldg(&beam[boffset1]); //int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm; - int boffset2=itm*N*K*F+k*N*F+cf*N+sta2; + int boffset2=itm*N*K*F+k1*N*F+cf*N+sta2; float beam2=__ldg(&beam[boffset2]); - scalef *=(double)(beam1*beam2); + If *=beam1*beam2; } - /* form complex value */ - prodterm.x *=scalef; - prodterm.y *=scalef; + prodterm.x *=If; + prodterm.y *=If; /* check for type of source */ if (stypeT!=STYPE_POINT) { - double uscaled=u*myfreq; - double vscaled=v*myfreq; - double wscaled=w*myfreq; + float uscaled=u*myfreq; + float vscaled=v*myfreq; + float wscaled=w*myfreq; if (stypeT==STYPE_SHAPELET) { - prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmulf(shapelet_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); } else if (stypeT==STYPE_GAUSSIAN) { - prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmulf(gaussian_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); } else if (stypeT==STYPE_DISK) { - prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmulf(disk_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); } 
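/* Source-shape note: each extended type multiplies the point-source phase
   term by the analytic Fourier transform of its brightness profile,
   evaluated at the frequency-scaled (u,v,w): a mode sum for shapelets, an
   elliptic exponential for Gaussians, J1 for a uniform disk, and (in the
   branch below) J0 for a ring. */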
else if (stypeT==STYPE_RING) { - prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmulf(ring_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); } } + +//printf("k=%d cf=%d freq=%f uvw %f,%f,%f lmn %f,%f,%f phterm %f If %f\n",k,cf,freqs[cf],u,v,w,ll[k],mm[k],nn[k],phterm,If); - double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy; - Ix=If*prodterm.x; - Iy=If*prodterm.y; - Qx=Qf*prodterm.x; - Qy=Qf*prodterm.y; - Ux=Uf*prodterm.x; - Uy=Uf*prodterm.y; - Vx=Vf*prodterm.x; - Vy=Vf*prodterm.y; + /* write output to shared array */ + tmpcoh[k*8*F+8*cf]=prodterm.x; + tmpcoh[k*8*F+8*cf+1]=prodterm.y; + tmpcoh[k*8*F+8*cf+2]=0.0f; + tmpcoh[k*8*F+8*cf+3]=0.0f; + tmpcoh[k*8*F+8*cf+4]=0.0f; + tmpcoh[k*8*F+8*cf+5]=0.0f; + tmpcoh[k*8*F+8*cf+6]=prodterm.x; + tmpcoh[k*8*F+8*cf+7]=prodterm.y; + } + } + __syncthreads(); - - - output[0]=Ix+Qx; - output[1]=Iy+Qy; - output[2]=Ux-Vy; - output[3]=Vx+Uy; - output[4]=Ux+Vy; - output[5]=-Vx+Uy; - output[6]=Ix-Qx; - output[7]=Iy-Qy; - -} - - -__device__ void -compute_prodterm(int sta1, int sta2, int N, int K, int T, - double phterm0, double If, double Qf, double Uf, double Vf, - double myfreq, double deltaf, int dobeam, int itm, int k, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) { - /* F==1 is assumed, output: 8x1 array */ - double sinph,cosph; - sincos(phterm0*myfreq,&sinph,&cosph); - cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph); - - /* smearing, beam */ - double scalef=1.0; - double phterm =(phterm0*0.5*deltaf); - if (phterm!=0.0) { - sinph=(sin(phterm)/phterm); - scalef *=fabs(sinph); /* catch -ve values due to rounding off */ - } - - if (dobeam) { - /* get beam info */ - int boffset1=itm*N*K+k*N+sta1; - // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2); - float beam1=__ldg(&beam[boffset1]); - int boffset2=itm*N*K+k*N+sta2; - float beam2=__ldg(&beam[boffset2]); - scalef *=(double)(beam1*beam2); - } - - - /* form complex value */ - prodterm.x *=scalef; - prodterm.y *=scalef; - - /* check for type of source */ - if (stypeT!=STYPE_POINT) { - double uscaled=u*myfreq; - double vscaled=v*myfreq; - double wscaled=w*myfreq; - if (stypeT==STYPE_SHAPELET) { - prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); - } else if (stypeT==STYPE_GAUSSIAN) { - prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); - } else if (stypeT==STYPE_DISK) { - prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); - } else if (stypeT==STYPE_RING) { - prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + // Build summation tree over elements, handling case where total threads is not a power of two. + int nTotalThreads = blockDim_2; // Total number of threads (==Kused), rounded up to the next power of two + while(nTotalThreads > 1) { + int halfPoint = (nTotalThreads >> 1); // divide by two + if (k < halfPoint) { + int thread2 = k + halfPoint; + if (thread2 < blockDim.x) { // Skipping the fictitious threads >Kused ( blockDim.x ... 
blockDim_2-1 ) + for(int cf=0; cf1 - if (F==1) { - //use simply for-loop, if K is very large this may be slow and may need further parallelization - for (int k=0; k>>(sta1,sta2,tslot,B,N,T,K,K,0,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,ll,mm,nn,sI,stype,sI0,f0,spec_idx,spec_idx1,spec_idx2,exs,deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(K)); + cudaDeviceSynchronize(); +#ifdef CUDA_DBG + error = cudaGetLastError(); + if(error != cudaSuccess) { + // print the CUDA error message and exit + printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); + } +#endif } else { - //use simply for-loop, if K is very large this may be slow and may need further parallelization - for (int k=0; k>>(sta1,sta2,tslot,B,N,T,K,myT,ct,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,&ll[ct],&mm[ct],&nn[ct],&sI[ct],&stype[ct],&sI0[ct],&f0[ct],&spec_idx[ct],&spec_idx1[ct],&spec_idx2[ct],&exs[ct],deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(myT)); + cudaDeviceSynchronize(); +#ifdef CUDA_DBG + error = cudaGetLastError(); + if(error != cudaSuccess) { + // print the CUDA error message and exit + printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); + } +#endif + ct=ct+ThreadsPerBlock; } + } - } } -/* kernel to calculate residuals */ -__global__ void -kernel_residuals(int B, int N, int T, int K, int F, - const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w, - const double *__restrict__ p, int nchunk, - baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn, - const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV, - const unsigned char *__restrict__ stype, const double *__restrict__ sI0, -const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0, - const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) { - - /* global thread index */ - unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; - - /* each thread will calculate for one baseline, over all sources */ - if (n1 - if (F==1) { - //use simply for-loop, if K is very large this may be slow and may need further parallelization - for (int k=0; kdata, y dim-> stations */ - dim3 grid(1, 1, 1); - grid.x = (int)ceilf((K*T*F) / (float)ThreadsPerBlock); - grid.y = N; - - kernel_array_beam<<>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam); + int BlocksPerGrid= 2*(Ntotal+ThreadsPerBlock-1)/ThreadsPerBlock; + kernel_array_beam<<>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam,buffer); cudaDeviceSynchronize(); + cudaFree(buffer); #ifdef CUDA_DBG error = cudaGetLastError(); if(error != cudaSuccess) { @@ -1015,7 +751,7 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud /* calculate coherencies: - B: total baselines (could be more than one timeslot) + B: total baselines N: no of stations T: no of time slots K: no of sources @@ -1038,20 +774,26 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud dobeam: enable beam if >0 */ void -cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t 
*barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV, - unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) { +cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, float *w,baseline_t *barr, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI, + unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *coh,int dobeam) { #ifdef CUDA_DBG cudaError_t error; error = cudaGetLastError(); + error=cudaMemset(coh,0,sizeof(float)*8*B*F); + checkCudaError(error,__FILE__,__LINE__); +#endif +#ifndef CUDA_DBG + cudaMemset(coh,0,sizeof(float)*8*B*F); #endif /* spawn threads to handle baselines, these threads will spawn threads for sources */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; + // int ThreadsPerBlock=16; /* note: make sure we do not exceed max no of blocks available, otherwise (too many baselines, loop over source id) */ - int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock; - kernel_coherencies<<>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV, - stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); + int BlocksPerGrid= 2*(B+ThreadsPerBlock-1)/ThreadsPerBlock; + kernel_coherencies<<>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, + stype, sI0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); cudaDeviceSynchronize(); #ifdef CUDA_DBG error = cudaGetLastError(); @@ -1064,56 +806,6 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, } -/* p : parameters 8Nxnchunk values */ -void -cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV, - unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) { -#ifdef CUDA_DBG - cudaError_t error; - error = cudaGetLastError(); -#endif - - /* spawn threads to handle baselines, these threads will loop over sources */ - int ThreadsPerBlock=DEFAULT_TH_PER_BK; - /* note: make sure we do not exceed max no of blocks available, - otherwise (too many baselines, loop over source id) */ - int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock; - kernel_residuals<<>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV, - stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); - cudaDeviceSynchronize(); -#ifdef CUDA_DBG - error = cudaGetLastError(); - if(error != cudaSuccess) { - // print the CUDA error message and exit - fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); - exit(-1); - } -#endif -} - - -void -cudakernel_correct_residuals(int B, int N, int Nb, int boff, int F, int nchunk, double *x, double *p, baseline_t *barr) { -#ifdef CUDA_DBG - cudaError_t error; - error = cudaGetLastError(); -#endif - - /* spawn threads to handle baselines */ - int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int 
BlocksPerGrid=(Nb+ThreadsPerBlock-1)/ThreadsPerBlock; - kernel_correct_residuals<<>>(B, N, Nb, boff, F, nchunk, x, p, barr); - cudaDeviceSynchronize(); -#ifdef CUDA_DBG - error = cudaGetLastError(); - if(error != cudaSuccess) { - // print the CUDA error message and exit - fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); - exit(-1); - } -#endif -} - /* convert time JD to GMST angle store result at the same location */ void @@ -1126,10 +818,10 @@ cudakernel_convert_time(int T, double *time_utc) { int ThreadsPerBlock=DEFAULT_TH_PER_BK; /* note: make sure we do not exceed max no of blocks available, otherwise (too many baselines, loop over source id) */ - int BlocksPerGrid=(T+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(T+ThreadsPerBlock-1)/ThreadsPerBlock; kernel_convert_time<<>>(T,time_utc); cudaDeviceSynchronize(); -#ifdef CUDA_DBG + #ifdef CUDA_DBG error = cudaGetLastError(); if(error != cudaSuccess) { // print the CUDA error message and exit diff --git a/src/lib/Radio/predict_withbeam.c b/src/lib/Radio/predict_withbeam.c index 53a3d9a..52722e2 100644 --- a/src/lib/Radio/predict_withbeam.c +++ b/src/lib/Radio/predict_withbeam.c @@ -1114,7 +1114,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit for (cj=0; cj1) */ - int Nb; /* no of baselines handeled by this thread */ - baseline_t *barr; /* baseline info */ - int Nf; /* of of freqs */ - - double *pinv; /* inverse of solutions to apply: size 8N*nchunk */ - int nchunk; /* how many solutions */ -} thread_data_corr_t; /* copy Nx1 double array x to device as float first allocate device memory */ @@ -146,7 +91,7 @@ dtofcopy(int N, float **x_d, double *x) { checkCudaError(err,__FILE__,__LINE__); /* copy memory */ - err=cudaMemcpy(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice); + err=cudaMemcpyAsync(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice,0); checkCudaError(err,__FILE__,__LINE__); /* free host buffer */ @@ -173,57 +118,38 @@ precalcoh_threadfn(void *data) { err=cudaSetDevice(card); checkCudaError(err,__FILE__,__LINE__); - err=cudaSetDeviceFlags(cudaDeviceLmemResizeToMax); - checkCudaError(err,__FILE__,__LINE__); - /* make sure enough heap memory is available for shapelet computations */ - size_t plim; - err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize); - checkCudaError(err,__FILE__,__LINE__); - if (plimtime_utc as input to fwrite. 
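One caveat on the cudaMemcpy to cudaMemcpyAsync change inside dtofcopy() above: whether the xhost staging buffer is pinned is not visible in this hunk. If it is pageable, the async copy behaves synchronously with respect to that buffer and nothing changes; if it is pinned, the call returns while the DMA transfer may still be in flight, and the immediate free of the host buffer races with the copy unless the stream is synchronized first. A minimal sketch of the staging pattern with an explicit synchronize (illustrative names, not the codebase's function):

#include <cuda_runtime.h>

/* Copy an Nx1 double array to the device as float: stage a float
 * conversion in pinned host memory, start an async H2D copy on the
 * default stream, and synchronize before the staging buffer is freed. */
static int dtof_copy_sketch(int N, float **x_d, const double *x) {
  float *xhost, *xc;
  if (cudaMallocHost((void **)&xhost, N * sizeof(float)) != cudaSuccess)
    return -1;
  for (int ci = 0; ci < N; ci++) xhost[ci] = (float)x[ci];
  if (cudaMalloc((void **)&xc, N * sizeof(float)) != cudaSuccess) {
    cudaFreeHost(xhost);
    return -1;
  }
  cudaError_t err = cudaMemcpyAsync(xc, xhost, N * sizeof(float),
                                    cudaMemcpyHostToDevice, 0);
  /* without this, cudaFreeHost below could race with the in-flight copy */
  cudaStreamSynchronize(0);
  cudaFreeHost(xhost);
  if (err != cudaSuccess) { cudaFree(xc); return -1; }
  *x_d = xc;
  return 0;
}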
+ if ((copyoftimed=(double*)calloc((size_t) t->tilesz, sizeof(double)))==0) { + fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + exit(1); } - double *ud,*vd,*wd; - double *cohd; - baseline_t *barrd; - double *freqsd; - float *longd=0,*latd=0; double *timed; int *Nelemd; - float **xx_p=0,**yy_p=0,**zz_p=0; + float **xx_p,**yy_p,**zz_p; float **xxd,**yyd,**zzd; /* allocate memory in GPU */ - err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(double)); /* coherencies only for 1 cluster, Nf=1 */ + err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(float)); /* coherencies only for 1 cluster, Nf=1 */ checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t)); checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int)); checkCudaError(err,__FILE__,__LINE__); - /* u,v,w and l,m,n coords need to be double for precision */ - err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double)); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double)); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double)); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); - checkCudaError(err,__FILE__,__LINE__); - + /* copy to device */ + dtofcopy(t->Nbase,&ud,t->u); + dtofcopy(t->Nbase,&vd,t->v); + dtofcopy(t->Nbase,&wd,t->w); err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice); checkCudaError(err,__FILE__,__LINE__); - err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double)); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice); - checkCudaError(err,__FILE__,__LINE__); - - if (t->dobeam) { + dtofcopy(t->Nf,&freqsd,t->freqs); dtofcopy(t->N,&longd,t->longitude); dtofcopy(t->N,&latd,t->latitude); err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double)); @@ -233,9 +159,19 @@ precalcoh_threadfn(void *data) { /* convert time jd to GMST angle */ cudakernel_convert_time(t->tilesz,timed); + // Fill copyoftimed. 
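/* The in-place conversion above turns the UTC (JD) timestamps in timed
   into GMST angles on the device; the copy below brings the converted
   values back so the host keeps both versions (t->time_utc stays in UTC
   for the later fwrite). Note that err from this device-to-host copy is
   overwritten by the next call before any checkCudaError() test, so a
   failure here would go unnoticed. */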
+ // Fill copyoftimed.
+ err=cudaMemcpy((double*)copyoftimed, timed, t->tilesz*sizeof(double), cudaMemcpyDeviceToHost);
+ checkCudaError(err,__FILE__,__LINE__);
+
 err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 
+ /* temp host storage to copy coherencies */
+ complex float *tempcoh;
+ if ((tempcoh=(complex float*)calloc((size_t)t->Nbase*4,sizeof(complex float)))==0) {
+  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+
 /* jagged arrays for element locations */
 err=cudaMalloc((void**)&xxd, t->N*sizeof(int*));
 checkCudaError(err,__FILE__,__LINE__);
@@ -277,121 +213,40 @@ precalcoh_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
- }
 
 float *beamd;
- double *lld,*mmd,*nnd;
- double *sId; float *rad,*decd;
- double *sQd,*sUd,*sVd;
+ /* temp host storage for beam */
+ float *tempbeam;
+
+ float *lld,*mmd,*nnd,*sId,*rad,*decd;
 unsigned char *styped;
- double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
- double *sQ0d,*sU0d,*sV0d;
+ float *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
 int **host_p,**dev_p;
-
- complex double *tempdcoh;
- err=cudaMallocHost((void**)&tempdcoh,sizeof(complex double)*(size_t)t->Nbase*4);
- checkCudaError(err,__FILE__,__LINE__);
-
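/* A minimal sketch (not SAGECal code) of the jagged-array pattern built above
   for xxd/yyd/zzd: one device buffer per antenna row, plus a device-side table
   of those row pointers, filled on the host and shipped in one cudaMemcpy.
   A kernel can then index the result as a float**; rowlen is assumed given. */
#include <cuda_runtime.h>
#include <stdlib.h>

static float **
make_jagged(int nrows, const int *rowlen, float ***rows_host_out) {
  float **rows_host=(float**)calloc((size_t)nrows,sizeof(float*));
  for (int ci=0; ci<nrows; ci++) {
    cudaMalloc((void**)&rows_host[ci], rowlen[ci]*sizeof(float)); /* row buffer */
  }
  float **rows_dev;
  cudaMalloc((void**)&rows_dev, nrows*sizeof(float*));
  /* copy the pointer table itself to the device */
  cudaMemcpy(rows_dev, rows_host, nrows*sizeof(float*), cudaMemcpyHostToDevice);
  *rows_host_out=rows_host; /* keep it: needed later to cudaFree each row */
  return rows_dev;
}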
/******************* begin loop over clusters **************************/
 for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
-
- if (t->dobeam) {
-  /* allocate memory for this clusters beam */
-  err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
-  checkCudaError(err,__FILE__,__LINE__);
- } else {
-  beamd=0;
- }
+ /* allocate memory for this clusters beam */
+ err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
+ checkCudaError(err,__FILE__,__LINE__);
 
 /* copy cluster details to GPU */
 err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
 checkCudaError(err,__FILE__,__LINE__);
 
- err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- if (t->Nf==1) {
-  err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- if (t->dobeam) {
-  dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
-  dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
- } else {
-  rad=0;
-  decd=0;
- }
+ dtofcopy(t->carr[ncl].N,&lld,t->carr[ncl].ll);
+ dtofcopy(t->carr[ncl].N,&mmd,t->carr[ncl].mm);
+ dtofcopy(t->carr[ncl].N,&nnd,t->carr[ncl].nn);
+ dtofcopy(t->carr[ncl].N,&sId,t->carr[ncl].sI);
+ dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
+ dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
 err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 
- /* for multi channel data - FIXME: remove this part because Nf==1 always */
- if (t->Nf>1) {
-  err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
+ /* for multi channel data */
+ dtofcopy(t->carr[ncl].N,&sI0d,t->carr[ncl].sI0);
+ dtofcopy(t->carr[ncl].N,&f0d,t->carr[ncl].f0);
+ dtofcopy(t->carr[ncl].N,&spec_idxd,t->carr[ncl].spec_idx);
+ dtofcopy(t->carr[ncl].N,&spec_idx1d,t->carr[ncl].spec_idx1);
+ dtofcopy(t->carr[ncl].N,&spec_idx2d,t->carr[ncl].spec_idx2);
 
 /* extra info for source, if any */
 if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
@@ -442,29 +297,152 @@ precalcoh_threadfn(void *data) {
 }
 
+ if ((tempbeam=(float*)calloc((size_t) t->N*t->tilesz*t->carr[ncl].N*t->Nf,sizeof(float)))==0) {
+  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
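/* The deleted comment above ("u,v,w and l,m,n coords need to be double for
   precision") is the main risk of this float conversion: the model phase is
   2*pi*(u*l+v*m+w*n) with u,v,w in wavelengths, and single precision can
   leave phase errors of order 1e-2 rad on long baselines. A standalone sketch
   with illustrative numbers (not project data): */
#include <stdio.h>
#include <math.h>

int main(void) {
  double u=8.0e5, v=-3.2e5, w=1.1e4;  /* long baseline, in wavelengths */
  double l=0.05, m=-0.02, n=sqrt(1.0-l*l-m*m);
  double ph=2.0*M_PI*(u*l+v*m+w*n);   /* double-precision phase */
  float phf=2.0f*(float)M_PI*((float)u*(float)l+(float)v*(float)m+(float)w*(float)n);
  printf("phase error = %e rad\n", fabs(ph-(double)phf));
  return 0;
}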
 /* now copy pointer locations to device */
 err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 
- if (t->dobeam) {
-  /* now calculate beam for all sources in this cluster */
-  cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
- }
+ FILE *t_N;
+ t_N=fopen("t_N.bin","wb");
+ fwrite(&t->N, sizeof(int), 1, t_N);
+ fclose(t_N);
+
+ FILE *t_tilesz;
+ t_tilesz=fopen("t_tilesz.bin","wb");
+ fwrite(&t->tilesz, sizeof(int), 1, t_tilesz);
+ fclose(t_tilesz);
+
+ FILE *t_carr_ncl_N;
+ t_carr_ncl_N=fopen("t_carr_ncl_N.bin","wb");
+ fwrite(&t->carr[ncl].N, sizeof(int), 1, t_carr_ncl_N);
+ fclose(t_carr_ncl_N);
+
+ FILE *t_Nf;
+ t_Nf=fopen("t_Nf.bin","wb");
+ fwrite(&t->Nf, sizeof(int), 1, t_Nf);
+ fclose(t_Nf);
+
+ FILE *freq_sd;
+ freq_sd=fopen("freq_sd.bin","wb");
+ fwrite(t->freqs, sizeof(double), t->Nf, freq_sd);
+ fclose(freq_sd);
+
+ FILE *long_d;
+ long_d=fopen("long_d.bin","wb");
+ fwrite(t->longitude, sizeof(double), t->N, long_d);
+ fclose(long_d);
+
+ FILE *lat_d;
+ lat_d=fopen("lat_d.bin","wb");
+ fwrite(t->latitude, sizeof(double), t->N, lat_d);
+ fclose(lat_d);
+
+ FILE *time_d;
+ time_d=fopen("time_d.bin","wb");
+ fwrite(copyoftimed, sizeof(double), t->tilesz, time_d);
+ fclose(time_d);
+
+ FILE *Nelem_d;
+ Nelem_d=fopen("Nelem_d.bin","wb");
+ fwrite(t->Nelem, sizeof(int), t->N, Nelem_d);
+ fclose(Nelem_d);
+
+ FILE *xx_d;
+ xx_d=fopen("xx_d.bin","wb");
+ int j;
+ for (j=0; j<t->N; j++){
+  fwrite(t->xx[j], sizeof(double), t->Nelem[j], xx_d);
+ }
+ fclose(xx_d);
+
+ FILE *yy_d;
+ yy_d=fopen("yy_d.bin","wb");
+ for (j=0; j<t->N; j++){
+  fwrite(t->yy[j], sizeof(double), t->Nelem[j], yy_d);
+ }
+ fclose(yy_d);
+
+ FILE *zz_d;
+ zz_d=fopen("zz_d.bin","wb");
+ for (j=0; j<t->N; j++){
+  fwrite(t->zz[j], sizeof(double), t->Nelem[j], zz_d);
+ }
+ fclose(zz_d);
+
+ FILE *ra_d;
+ ra_d=fopen("ra_d.bin","wb");
+ fwrite(t->carr[ncl].ra, sizeof(double), t->carr[ncl].N, ra_d);
+ fclose(ra_d);
+
+ FILE *dec_d;
+ dec_d=fopen("dec_d.bin","wb");
+ fwrite(t->carr[ncl].dec, sizeof(double), t->carr[ncl].N, dec_d);
+ fclose(dec_d);
+
+ FILE *t_ph_ra0;
+ t_ph_ra0=fopen("t_ph_ra0.bin","wb");
+ fwrite(&t->ph_ra0, sizeof(double), 1, t_ph_ra0);
+ fclose(t_ph_ra0);
+
+ FILE *t_ph_dec0;
+ t_ph_dec0=fopen("t_ph_dec0.bin","wb");
+ fwrite(&t->ph_dec0, sizeof(double), 1, t_ph_dec0);
+ fclose(t_ph_dec0);
+
+ FILE *t_ph_freq0;
+ t_ph_freq0=fopen("t_ph_freq0.bin","wb");
+ fwrite(&t->ph_freq0, sizeof(double), 1, t_ph_freq0);
+ fclose(t_ph_freq0);
+
+ /* now calculate beam for all sources in this cluster */
+ cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
+
+ FILE *beam_d;
+ beam_d=fopen("beam_d.bin","wb");
+ err=cudaMemcpy((float*)tempbeam, beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float), cudaMemcpyDeviceToHost);
+ checkCudaError(err,__FILE__,__LINE__);
+
+ for (j=0; j<t->N*t->tilesz*t->carr[ncl].N*t->Nf; j++){
+  fwrite(&tempbeam[j], sizeof(float), 1, beam_d);
+ }
+ fclose(beam_d);
+ free(tempbeam);
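/* Each dump above repeats the same fopen/fwrite/fclose sequence. A
   hypothetical helper (dump_darray is not part of SAGECal) that captures the
   pattern, keeping fwrite's (element size, element count) order consistent: */
#include <stdio.h>

static void
dump_darray(const char *fname, const double *arr, size_t n) {
  FILE *fp=fopen(fname,"wb");
  if (!fp) { perror(fname); return; }
  fwrite(arr, sizeof(double), n, fp); /* size first, then count */
  fclose(fp);
}
/* usage: dump_darray("long_d.bin", t->longitude, (size_t)t->N); */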
 /* calculate coherencies for all sources in this cluster, add them up */
 cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
- lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
+ lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,(float)t->fdelta,(float)t->tdelta,(float)t->dec0,cohd,t->dobeam);
 
 /* copy back coherencies to host,
  coherencies on host have 8M stride, on device have 8 stride */
- err=cudaMemcpy((double*)tempdcoh, cohd, sizeof(double)*t->Nbase*8, cudaMemcpyDeviceToHost);
+ err=cudaMemcpy((float*)tempcoh, cohd, sizeof(float)*t->Nbase*8, cudaMemcpyDeviceToHost);
 checkCudaError(err,__FILE__,__LINE__);
 
+ complex double *tempdcoh;
+ if ((tempdcoh=(complex double*)calloc((size_t)t->Nbase*4,sizeof(complex double)))==0) {
+  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+ /* widen the float coherencies back to double */
+ int di;
+ double *dcp=(double*)tempdcoh;
+ float *fcp=(float*)tempcoh;
+ for (di=0; di<t->Nbase; di++) {
+  dcp[8*di]=(double)fcp[8*di];
+  dcp[8*di+1]=(double)fcp[8*di+1];
+  dcp[8*di+2]=(double)fcp[8*di+2];
+  dcp[8*di+3]=(double)fcp[8*di+3];
+  dcp[8*di+4]=(double)fcp[8*di+4];
+  dcp[8*di+5]=(double)fcp[8*di+5];
+  dcp[8*di+6]=(double)fcp[8*di+6];
+  dcp[8*di+7]=(double)fcp[8*di+7];
+ }
 /* now copy this with right offset and stride */
 my_ccopy(t->Nbase,&tempdcoh[0],4,&(t->coh[4*ncl]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[1],4,&(t->coh[4*ncl+1]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[2],4,&(t->coh[4*ncl+2]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[3],4,&(t->coh[4*ncl+3]),4*t->M);
+ free(tempdcoh);
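/* my_ccopy above behaves like BLAS zcopy (n, x, incx, y, incy): with source
   stride 4 and destination stride 4*t->M it interleaves this cluster's four
   correlations per baseline into the M-cluster coherency array t->coh. A
   sketch of those semantics (an assumption: my_ccopy as the usual strided copy): */
#include <complex.h>

static void
ccopy_sketch(int n, const double complex *x, int incx,
             double complex *y, int incy) {
  for (int i=0; i<n; i++) {
    y[i*incy]=x[i*incx]; /* one complex correlation per baseline */
  }
}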
 for (cj=0; cj<t->carr[ncl].N; cj++) {
@@ -493,36 +471,24 @@ precalcoh_threadfn(void *data) {
  err=cudaFree(dev_p);
  checkCudaError(err,__FILE__,__LINE__);
 
- if (t->dobeam) {
-  err=cudaFree(beamd);
-  checkCudaError(err,__FILE__,__LINE__);
- }
+ err=cudaFree(beamd);
+ checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(lld);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(mmd);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(nnd);
 checkCudaError(err,__FILE__,__LINE__);
- if (t->Nf==1) {
 err=cudaFree(sId);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sQd);
+ err=cudaFree(rad);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sUd);
+ err=cudaFree(decd);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sVd);
- checkCudaError(err,__FILE__,__LINE__);
- }
- if (t->dobeam) {
-  err=cudaFree(rad);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(decd);
-  checkCudaError(err,__FILE__,__LINE__);
- }
 err=cudaFree(styped);
 checkCudaError(err,__FILE__,__LINE__);
- if (t->Nf>1) {
 err=cudaFree(sI0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(f0d);
@@ -533,21 +499,14 @@ precalcoh_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(spec_idx2d);
  checkCudaError(err,__FILE__,__LINE__);
-
- err=cudaFree(sQ0d);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sU0d);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sV0d);
- checkCudaError(err,__FILE__,__LINE__);
- }
 }
/******************* end loop over clusters **************************/
 
- /* free memory */
- err=cudaFreeHost(tempdcoh);
- checkCudaError(err,__FILE__,__LINE__);
+ free(tempcoh);
+ free(copyoftimed);
 
+ /* free memory */
 err=cudaFree(ud);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(vd);
@@ -560,8 +519,6 @@ precalcoh_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(freqsd);
  checkCudaError(err,__FILE__,__LINE__);
-
- if (t->dobeam) {
 err=cudaFree(longd);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(latd);
@@ -571,6 +528,7 @@ precalcoh_threadfn(void *data) {
 err=cudaFree(Nelemd);
 checkCudaError(err,__FILE__,__LINE__);
+
 for (ci=0; ci<t->N; ci++) {
  err=cudaFree(xx_p[ci]);
  checkCudaError(err,__FILE__,__LINE__);
@@ -590,11 +548,9 @@ precalcoh_threadfn(void *data) {
 free(xx_p);
 free(yy_p);
 free(zz_p);
- }
 
 /* reset error state */
 err=cudaGetLastError();
-
 return NULL;
}
@@ -630,7 +586,12 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
  taskhist thst;
  init_task_hist(&thst);
 
- int Ngpu=MAX_GPU_ID+1;
+ int Ngpu;
+ if (M<4) {
+  Ngpu=2;
+ } else {
+  Ngpu=4;
+ }
 
 /* calculate min clusters thread can handle */
 Nthb0=(M+Ngpu-1)/Ngpu;
@@ -664,10 +625,10 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
 threaddata[nth].u=u;
 threaddata[nth].v=v;
 threaddata[nth].w=w;
 threaddata[nth].coh=x;
- threaddata[nth].x=0; /* no input data */
+ threaddata[nth].x=0; /* no data */
 threaddata[nth].N=N;
- threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbase x tilesz (from input) */
+ threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbase x tilesz */
 threaddata[nth].barr=barr;
 threaddata[nth].carr=carr;
 threaddata[nth].M=M;
@@ -768,7 +729,6 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
 free(threaddata1);
 pthread_attr_destroy(&attr);
 free(th_array);
-
 return 0;
}
@@ -789,33 +749,18 @@ predictvis_threadfn(void *data) {
 cudaError_t err;
 int ci,ncl,cj;
-
 err=cudaSetDevice(card);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaSetDeviceFlags(cudaDeviceLmemResizeToMax);
- checkCudaError(err,__FILE__,__LINE__);
 
- /* make sure enough heap memory is available for shapelet computations */
- size_t plim;
- err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
- checkCudaError(err,__FILE__,__LINE__);
- if (plim<GPU_HEAP_SIZE*1024*1024) {
-  err=cudaDeviceSetLimit(cudaLimitMallocHeapSize,GPU_HEAP_SIZE*1024*1024);
-  checkCudaError(err,__FILE__,__LINE__);
- }
 
- double *ud,*vd,*wd;
- double *cohd;
+ float *ud,*vd,*wd;
+ float *cohd;
 baseline_t *barrd;
- double *freqsd;
- float *longd=0,*latd=0;
+ float *freqsd;
+ float *longd,*latd;
 double *timed;
 int *Nelemd;
 float **xx_p,**yy_p,**zz_p;
 float **xxd,**yyd,**zzd;
 
 /* allocate memory in GPU */
- err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
+ err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(float)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
 checkCudaError(err,__FILE__,__LINE__);
@@ -823,31 +768,13 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
 
 /* copy to device */
- /* u,v,w and l,m,n coords need to be double for precision */
- err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
+ dtofcopy(t->Nbase,&ud,t->u);
+ dtofcopy(t->Nbase,&vd,t->v);
+ dtofcopy(t->Nbase,&wd,t->w);
 err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- /* check if beam is actually calculated */
- if (t->dobeam) {
+ dtofcopy(t->Nf,&freqsd,t->freqs);
 dtofcopy(t->N,&longd,t->longitude);
 dtofcopy(t->N,&latd,t->latitude);
 err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
@@ -901,122 +828,39 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
- }
 
 float *beamd;
- double *lld,*mmd,*nnd;
- double *sId; float *rad,*decd;
- double *sQd,*sUd,*sVd;
+ float *lld,*mmd,*nnd,*sId,*rad,*decd;
 unsigned char *styped;
- double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
- double *sQ0d,*sU0d,*sV0d;
+ float *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
 int **host_p,**dev_p;
-
- double *xlocal;
- err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
- checkCudaError(err,__FILE__,__LINE__);
-
/******************* begin loop over clusters **************************/
 for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
  /* allocate memory for this clusters beam */
- if (t->dobeam) {
-  err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
-  checkCudaError(err,__FILE__,__LINE__);
- } else {
-  beamd=0;
- }
+ err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
+ checkCudaError(err,__FILE__,__LINE__);
 
 /* copy cluster details to GPU */
 err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
 checkCudaError(err,__FILE__,__LINE__);
 
- err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- if (t->Nf==1) {
-  err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- if (t->dobeam) {
-  dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
-  dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
- } else {
-  rad=0;
-  decd=0;
- }
+ dtofcopy(t->carr[ncl].N,&lld,t->carr[ncl].ll);
+ dtofcopy(t->carr[ncl].N,&mmd,t->carr[ncl].mm);
+ dtofcopy(t->carr[ncl].N,&nnd,t->carr[ncl].nn);
+ dtofcopy(t->carr[ncl].N,&sId,t->carr[ncl].sI);
+ dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
+ dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
 err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 
 /* for multi channel data */
- if (t->Nf>1) {
-  err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
+ dtofcopy(t->carr[ncl].N,&sI0d,t->carr[ncl].sI0);
+ dtofcopy(t->carr[ncl].N,&f0d,t->carr[ncl].f0);
+ dtofcopy(t->carr[ncl].N,&spec_idxd,t->carr[ncl].spec_idx);
+ dtofcopy(t->carr[ncl].N,&spec_idx1d,t->carr[ncl].spec_idx1);
+ dtofcopy(t->carr[ncl].N,&spec_idx2d,t->carr[ncl].spec_idx2);
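/* The sI0/f0/spec_idx/spec_idx1/spec_idx2 arrays copied above parameterize the
   multi-channel flux model: a log-polynomial spectrum around the reference
   frequency f0, matching the three spectral-index terms in the sky-model
   format (an assumption about the exact convention; valid for positive sI0).
   A host-side sketch of the per-channel evaluation: */
#include <math.h>

static double
flux_at_freq(double sI0, double f0, double sp0, double sp1, double sp2, double f) {
  double lf=log(f/f0); /* log frequency ratio */
  return exp(log(sI0)+sp0*lf+sp1*lf*lf+sp2*lf*lf*lf);
}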
 /* extra info for source, if any */
 if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
@@ -1072,21 +916,37 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
 
- if (t->dobeam) {
-  /* now calculate beam for all sources in this cluster */
-  cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
- }
+ /* now calculate beam for all sources in this cluster */
+ cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
 
 /* calculate coherencies for all sources in this cluster, add them up */
 cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
- lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
+ lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,(float)t->fdelta,(float)t->tdelta,(float)t->dec0,cohd,t->dobeam);
 
 /* copy back coherencies to host,
  coherencies on host have 8M stride, on device have 8 stride */
- err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
+ float *tempx;
+ if ((tempx=(float*)calloc((size_t)t->Nbase*8*t->Nf,sizeof(float)))==0) {
+  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+ err=cudaMemcpy(tempx, cohd, sizeof(float)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
 checkCudaError(err,__FILE__,__LINE__);
- my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
+ /* copy back as double */
+ int di;
+ for (di=0; di<t->Nbase*t->Nf; di++) {
+  t->x[8*di]=(double)tempx[8*di];
+  t->x[8*di+1]=(double)tempx[8*di+1];
+  t->x[8*di+2]=(double)tempx[8*di+2];
+  t->x[8*di+3]=(double)tempx[8*di+3];
+  t->x[8*di+4]=(double)tempx[8*di+4];
+  t->x[8*di+5]=(double)tempx[8*di+5];
+  t->x[8*di+6]=(double)tempx[8*di+6];
+  t->x[8*di+7]=(double)tempx[8*di+7];
+ }
+ free(tempx);
+
 for (cj=0; cj<t->carr[ncl].N; cj++) {
  if (t->carr[ncl].stype[cj]==STYPE_POINT) {
@@ -1114,36 +974,24 @@ predictvis_threadfn(void *data) {
  err=cudaFree(dev_p);
  checkCudaError(err,__FILE__,__LINE__);
 
- if (t->dobeam) {
-  err=cudaFree(beamd);
-  checkCudaError(err,__FILE__,__LINE__);
- }
+ err=cudaFree(beamd);
+ checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(lld);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(mmd);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(nnd);
 checkCudaError(err,__FILE__,__LINE__);
- if (t->Nf==1) {
 err=cudaFree(sId);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sQd);
+ err=cudaFree(rad);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sUd);
+ err=cudaFree(decd);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sVd);
- checkCudaError(err,__FILE__,__LINE__);
- }
- if (t->dobeam) {
-  err=cudaFree(rad);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(decd);
-  checkCudaError(err,__FILE__,__LINE__);
- }
 err=cudaFree(styped);
 checkCudaError(err,__FILE__,__LINE__);
- if (t->Nf>1) {
 err=cudaFree(sI0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(f0d);
@@ -1154,21 +1002,10 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(spec_idx2d);
  checkCudaError(err,__FILE__,__LINE__);
-
- err=cudaFree(sQ0d);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sU0d);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sV0d);
- checkCudaError(err,__FILE__,__LINE__);
- }
 }
/******************* end loop over clusters **************************/
 
 /* free memory */
- err=cudaFreeHost(xlocal);
- checkCudaError(err,__FILE__,__LINE__);
-
 err=cudaFree(ud);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(vd);
@@ -1181,8 +1018,6 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(freqsd);
  checkCudaError(err,__FILE__,__LINE__);
-
- if (t->dobeam) {
 err=cudaFree(longd);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(latd);
@@ -1212,9 +1047,6 @@ predictvis_threadfn(void *data) {
 free(xx_p);
 free(yy_p);
 free(zz_p);
- }
-
 
 /* reset error state */
 err=cudaGetLastError();
@@ -1235,8 +1067,12 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
  taskhist thst;
  init_task_hist(&thst);
 
- /* oversubscribe GPU */
- int Ngpu=MAX_GPU_ID+1;
+ int Ngpu;
+ if (M<4) {
+  Ngpu=2;
+ } else {
+  Ngpu=4;
+ }
 
 /* calculate min clusters thread can handle */
 Nthb0=(M+Ngpu-1)/Ngpu;
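/* The cluster-to-thread split used on both code paths: Nthb0=(M+Ngpu-1)/Ngpu
   is a ceiling division, and the last thread picks up the remainder. A
   standalone sketch of the bookkeeping (the ci/Nthb names follow this file): */
#include <stdio.h>

int main(void) {
  int M=10;                    /* total clusters */
  int Ngpu=(M<4?2:4);          /* same heuristic as the patch */
  int Nthb0=(M+Ngpu-1)/Ngpu;   /* max clusters per thread */
  int ci=0;
  for (int nth=0; nth<Ngpu && ci<M; nth++) {
    int Nthb=(ci+Nthb0<M ? Nthb0 : M-ci); /* this thread's share */
    printf("thread %d: clusters [%d,%d)\n", nth, ci, ci+Nthb);
    ci+=Nthb;
  }
  return 0;
}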
@@ -1261,14 +1097,14 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
  exit(1);
 }
 
- if (add_to_data==SIMUL_ONLY) {
-  /* set input column to zero */
+ if (!add_to_data) {
+  /* set output column to zero */
  memset(x,0,sizeof(double)*Nbase*8*tilesz*Nchan);
 }
 
- /* set common parameters, and split clusters to threads */
+ /* set common parameters, and split baselines to threads */
 ci=0;
 for (nth=0; nth<Ngpu && ci<M; nth++) {
-static void *
-residual_threadfn(void *data) {
- thread_data_pred_t *t=(thread_data_pred_t*)data;
- /* first, select a GPU to run on */
- int card;
- if (t->M<=MAX_GPU_ID) {
-  card=select_work_gpu(MAX_GPU_ID,t->hst);
- } else {
-  card=t->tid;
- }
- cudaError_t err;
- int ci,ncl,cj;
-
-
- err=cudaSetDevice(card);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaSetDeviceFlags(cudaDeviceLmemResizeToMax);
- checkCudaError(err,__FILE__,__LINE__);
-
- /* make sure enough heap memory is available for shapelet computations */
- size_t plim;
- err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
- checkCudaError(err,__FILE__,__LINE__);
- if (plim<GPU_HEAP_SIZE*1024*1024) {
-  err=cudaDeviceSetLimit(cudaLimitMallocHeapSize,GPU_HEAP_SIZE*1024*1024);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- double *ud,*vd,*wd;
- double *cohd;
- baseline_t *barrd;
- double *freqsd;
- float *longd=0,*latd=0;
- double *timed;
- int *Nelemd;
- float **xx_p,**yy_p,**zz_p;
- float **xxd,**yyd,**zzd;
-
- /* allocate memory in GPU */
- err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
- checkCudaError(err,__FILE__,__LINE__);
-
- /* copy to device */
- /* u,v,w and l,m,n coords need to be double for precision */
- err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- /* check if beam is actually calculated */
- if (t->dobeam) {
-  dtofcopy(t->N,&longd,t->longitude);
-  dtofcopy(t->N,&latd,t->latitude);
-  err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  /* convert time jd to GMST angle */
-  cudakernel_convert_time(t->tilesz,timed);
-
-  err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  /* jagged arrays for element locations */
-  err=cudaMalloc((void**)&xxd, t->N*sizeof(int*));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**)&yyd, t->N*sizeof(int*));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**)&zzd, t->N*sizeof(int*));
-  checkCudaError(err,__FILE__,__LINE__);
-  /* allocate host memory to store pointers */
-  if ((xx_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
-   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if ((yy_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
-   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if ((zz_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
-   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  for (ci=0; ci<t->N; ci++) {
-   err=cudaMalloc((void**)&xx_p[ci], t->Nelem[ci]*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMalloc((void**)&yy_p[ci], t->Nelem[ci]*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMalloc((void**)&zz_p[ci], t->Nelem[ci]*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-  }
-  /* now copy data */
-  for (ci=0; ci<t->N; ci++) {
-   dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
-   dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
-   dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
-  }
-  /* now copy pointer locations to device */
-  err=cudaMemcpy(xxd, xx_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(yyd, yy_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- float *beamd;
- double *lld,*mmd,*nnd;
- double *sId; float *rad,*decd;
- double *sQd,*sUd,*sVd;
- unsigned char *styped;
- double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
- double *sQ0d,*sU0d,*sV0d;
- int **host_p,**dev_p;
-
- double *xlocal;
- err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
- checkCudaError(err,__FILE__,__LINE__);
-
- double *pd; /* parameter array per cluster */
-
-/******************* begin loop over clusters **************************/
- for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
-  /* check if cluster id >=0 to do a subtraction */
-  if (t->carr[ncl].id>=0) {
-   /* allocate memory for this clusters beam */
-   if (t->dobeam) {
-    err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
-    checkCudaError(err,__FILE__,__LINE__);
-   } else {
-    beamd=0;
-   }
-
-   /* copy cluster details to GPU */
-   err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
-   checkCudaError(err,__FILE__,__LINE__);
-
-   err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   /* parameter vector size may change depending on hybrid parameter */
-   err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   if (t->Nf==1) {
-    err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-
-    err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-
-   if (t->dobeam) {
-    dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
-    dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
-   } else {
-    rad=0;
-    decd=0;
-   }
-   err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   /* for multi channel data */
-   if (t->Nf>1) {
-    err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-
-    err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-
-   /* extra info for source, if any */
-   if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
-    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
-   }
-   err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
-   checkCudaError(err,__FILE__,__LINE__);
-
-   for (cj=0; cj<t->carr[ncl].N; cj++) {
-
-    if (t->carr[ncl].stype[cj]==STYPE_POINT) {
-     host_p[cj]=0;
-    } else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
-     exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
-     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
-     checkCudaError(err,__FILE__,__LINE__);
-     double *modes;
-     err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-     exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
-     err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
-     exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
-     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
-     exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
-     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_RING) {
-     exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
-     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-    }
-
-   }
-   /* now copy pointer locations to device */
-   err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   if (t->dobeam) {
-    /* now calculate beam for all sources in this cluster */
-    cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
-   }
-
-   /* calculate coherencies for all sources in this cluster, add them up */
-   cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
-   lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
-
-   /* copy back coherencies to host,
-    coherencies on host have 8M stride, on device have 8 stride */
-   err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
-   checkCudaError(err,__FILE__,__LINE__);
-   my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
-
-   for (cj=0; cj<t->carr[ncl].N; cj++) {
-    if (t->carr[ncl].stype[cj]==STYPE_POINT) {
-    } else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
-     exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
-     double *modes=0;
-     err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
-     err=cudaFree(modes);
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaFree(host_p[cj]);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
-     err=cudaFree(host_p[cj]);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
-     err=cudaFree(host_p[cj]);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_RING) {
-     err=cudaFree(host_p[cj]);
-     checkCudaError(err,__FILE__,__LINE__);
-    }
-   }
-   free(host_p);
-
-   err=cudaFree(dev_p);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   if (t->dobeam) {
-    err=cudaFree(beamd);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-   err=cudaFree(lld);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(mmd);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(nnd);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(pd);
-   checkCudaError(err,__FILE__,__LINE__);
-   if (t->Nf==1) {
-    err=cudaFree(sId);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sQd);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sUd);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sVd);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-   if (t->dobeam) {
-    err=cudaFree(rad);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(decd);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-   err=cudaFree(styped);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   if (t->Nf>1) {
-    err=cudaFree(sI0d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(f0d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(spec_idxd);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(spec_idx1d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(spec_idx2d);
-    checkCudaError(err,__FILE__,__LINE__);
-
-    err=cudaFree(sQ0d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sU0d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sV0d);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-  }
- }
-/******************* end loop over clusters **************************/
-
- /* free memory */
- err=cudaFreeHost(xlocal);
- checkCudaError(err,__FILE__,__LINE__);
-
- err=cudaFree(ud);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(vd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(wd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(cohd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(barrd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(freqsd);
- checkCudaError(err,__FILE__,__LINE__);
-
- if (t->dobeam) {
-  err=cudaFree(longd);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(latd);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(timed);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(Nelemd);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  for (ci=0; ci<t->N; ci++) {
-   err=cudaFree(xx_p[ci]);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(yy_p[ci]);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(zz_p[ci]);
-   checkCudaError(err,__FILE__,__LINE__);
-  }
-
-  err=cudaFree(xxd);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(yyd);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(zzd);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  free(xx_p);
-  free(yy_p);
-  free(zz_p);
- }
-
- /* reset error state */
- err=cudaGetLastError();
- return NULL;
-
-}
-
-
-static void *
-correct_threadfn(void *data) {
- thread_data_corr_t *t=(thread_data_corr_t*)data;
- /* first, select a GPU, if total threads > MAX_GPU_ID
-  use random selection, else use this thread id */
- int card;
- if (t->tid>MAX_GPU_ID) {
-  card=select_work_gpu(MAX_GPU_ID,t->hst);
- } else {
-  card=t->tid;
- }
- cudaError_t err;
- int cf;
-
- err=cudaSetDevice(card);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaSetDeviceFlags(cudaDeviceLmemResizeToMax);
- checkCudaError(err,__FILE__,__LINE__);
-
- double *xd;
- baseline_t *barrd;
- double *pd;
-
- /* allocate memory in GPU */
- err=cudaMalloc((void**) &xd, t->Nb*8*t->Nf*sizeof(double)); /* coherencies only for Nb baselines, Nf freqs */
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &barrd, t->Nb*sizeof(baseline_t));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &pd, t->N*8*t->nchunk*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
-
- /* copy with right offset */
- err=cudaMemcpy(barrd, &(t->barr[t->boff]), t->Nb*sizeof(baseline_t), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(pd, t->pinv, t->N*8*t->nchunk*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- for (cf=0; cf<t->Nf; cf++) {
-  err=cudaMemcpy(&xd[cf*8*t->Nb], &(t->x[cf*8*t->Nbase+t->boff*8]), t->Nb*8*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- /* run kernel */
- cudakernel_correct_residuals(t->Nbase,t->N,t->Nb,t->boff,t->Nf,t->nchunk,xd,pd,barrd);
-
- /* copy back corrected result */
- for (cf=0; cf<t->Nf; cf++) {
-  err=cudaMemcpy(&(t->x[cf*8*t->Nbase+t->boff*8]), &xd[cf*8*t->Nb], t->Nb*8*sizeof(double), cudaMemcpyDeviceToHost);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- err=cudaFree(xd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(barrd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(pd);
- checkCudaError(err,__FILE__,__LINE__);
-
- /* reset error state */
- err=cudaGetLastError();
- return NULL;
-}
-
-/* p: 8NMx1 parameter array, but M is 'effective' clusters, need to use carr to find the right offset
-  ccid: which cluster to use as correction
-  rho: MMSE robust parameter J+rho I inverted
-
-  phase_only: if >0, and if there is any correction done, use only phase of diagonal elements for correction
-*/
-int
-calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
-double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only) {
-
- int nth,ci;
-
- int Nthb0,Nthb;
- pthread_attr_t attr;
- pthread_t *th_array;
- thread_data_pred_t *threaddata;
- thread_data_corr_t *threaddata_corr;
- taskhist thst;
- init_task_hist(&thst);
-
- /* oversubscribe GPU */
- int Ngpu=MAX_GPU_ID+1;
-
- /* calculate min clusters thread can handle */
- Nthb0=(M+Ngpu-1)/Ngpu;
-
- /* setup threads : note: Ngpu is no of GPUs used */
- pthread_attr_init(&attr);
- pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
-
- if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
-  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-  exit(1);
- }
- if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
-  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-  exit(1);
- }
-
- /* arrays to store result */
- double *xlocal;
- if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
-  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-  exit(1);
- }
-
- /* set common parameters, and split clusters to threads */
- ci=0;
- for (nth=0; nth<Ngpu && ci<M; nth++) {
- if (cm>=0) {
-  double *pm,*pinv,*pphase=0;
-  /* allocate memory for inverse J */
-  if ((pinv=(double*)malloc((size_t)8*N*carr[cm].nchunk*sizeof(double)))==0) {
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if (!phase_only) {
-   for (cj=0; cj<carr[cm].nchunk; cj++) {
- ... > sm.ms.output
+../src/MS/sagecal -d sm.ms -s extended_source_list.txt -c extended_source_list.txt.cluster -n 4 -t 10 -p sm.ms.solutions -e 4 -g 2 -l 10 -m 7 -x 30 -F 1 -j 2 -k -1 -B 1 -W 0 > sm.ms.output