Merge branch 'master' into sourceforge

2018-02-06 10:59:45 +01:00 · 2018-02-06 10:59:45 +01:00 · 9110379d6a
parent 97f9b51ba3 538abbe6c1
commit 9110379d6a
27 changed files with 764 additions and 2581 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,25 @@
+*/*.png
+*/*.out
+*/*.output
+*/*.solutions
+*/*.dot
+test/sm.ms/
+test/nvprof-resultaten/
+src/MS/sagecal
+test/analysis-*.txt
+test/extended_*.*
+
+*/*/*.a
+*/*/*.o
+*/*/*.swp
+*/*/*.swo
+*/*/*.out
+*/*/*.output
+
+*/*/*/*.o
+*/*/*/*.a
+*/*/*/*.swp
+*/*/*/*.swo
+*/*/*/*.out
+*/*/*/*.output
+
--- a/INSTALL.md
+++ b/INSTALL.md
@ -1,7 +1,8 @@
 vr  2 dec 2016 23:07:19 CET
-SAGECal Installation
-====================
-0) Prerequsites:
+
+# SAGECal Installation
+
+## 1 Prerequisites:
 - CASACORE http://casacore.googlecode.com/
 - glib http://developer.gnome.org/glib
 - BLAS/LAPACK
@ -15,12 +16,13 @@ SAGECal Installation
  -- Intel MKL and other libraries
 - Get the source for SAGECal : git clone git://git.code.sf.net/p/sagecal/code sagecal-code

-1) The basic way to build is
+## 2 The basic way to build is
  1.a) go to ./src/lib  and run make (which will create libsagecal.a)
  1.b) go to ./src/MS and run make (which will create the executable)


-2) In ./src/lib and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are:
+## 3 Build settings
+In ./src/lib and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are:
 - LAPACK: directory where LAPACK/OpenBLAS is installed
 - GLIBI/GLIBL: include/lib files for glib
 - CASA_LIBDIR/CASA_INCDIR/CASA_LIBS : casacore include/library location and files:
@ -39,23 +41,23 @@ SAGECal Installation



-SAGECAL-MPI Installation
-========================
-0) Prerequsites:
+# SAGECAL-MPI Installation
+
+## 1 Prerequisites:
 - Same as above 
 - MPI (e.g. OpenMPI)

-1) Build ./src/lib as above (using mpicc -DMPI_BUILD)
+## 2 Build ./src/lib as above (using mpicc -DMPI_BUILD)

-2) Build ./src/MPI using mpicc++
+## 3 Build ./src/MPI using mpicc++



-BUILDSKY Installation
-=====================
-1) See INSTALL in ./src/buildsky
+# BUILDSKY Installation
+
+  - See INSTALL in ./src/buildsky


-RESTORE Installation
-=====================
-1) See INSTALL in ./src/restore
+# RESTORE Installation
+
+  - See INSTALL in ./src/restore
--- a/README.md
+++ b/README.md
@ -1,15 +1,17 @@
-SAGECAL
-=======
+# SAGECAL
+
+
 Read INSTALL for installation. This file gives a brief guide to use SAGECal.
 Warning: this file may be obsolete. use sagecal -h to see up-to-date options.


-Step by Step Introduction:
-#######################################################################
+## Step by Step Introduction:
+
 1a)Calibrate data in the standard way using BBS/CASA or anything else. 
 Use NDPP to average the data in your MS to a few channels (also average in time to about 10sec). Also flag any spikes in the data.
 1b)For subtraction of the ATeam from raw data (CasA,CygA,...), no initial calibration is necessary. Just run sagecal on raw data, but it is better to scale the sky model to match the apparent flux of the sources that are being subtracted.
-#######################################################################
+
+
 2) Sky Model:
 3a)Make an image of your MS (using ExCon/casapy). 
 Use Duchamp to create a mask for the image. Use buildsky to create a sky model. (see the README file on top level directory). Also create a proper cluster file.
--- a/src/MS/Makefile
+++ b/src/MS/Makefile
@ -36,5 +36,4 @@ data.o:data.cpp data.h
 sagecal:$(OBJECTS) ../lib/Radio/libsagecal.a ../lib/Dirac/libdirac.a 
 	$(CXX) $(CXXFLAGS) $(LDFLAGS) $(INCLUDES) $(GLIBI) $(LIBPATH)  -o $@  $(OBJECTS) $(MY_LIBS) $(LAPACK) $(CASA_LIBS)  $(GLIBL)
 clean:
-	rm *.o 
-
+	rm *.o *.tmp *.fits *.swp *.swo *.o *.output
--- a/src/MS/Makefile.gpu
+++ b/src/MS/Makefile.gpu
@ -10,9 +10,9 @@ LAPACK_DIR=/cm/shared/package/openblas/0.2.17mt/lib
 #LAPACK_DIR=/usr/lib/atlas/sse/

 CUDAINC=-I/cm/shared/apps/cuda80/toolkit/8.0.44/include/
-CUDALIB=-L/cm/shared/apps/cuda80/toolkit/8.0.44/lib64/ -lcuda -lcudart
+CUDALIB=-lcublas -lcusolver -lcudadevrt

-CULALIB=-lcublas -lcusolver -lcudadevrt
+# CULALIB=-lcublas -lcusolver -lcudadevrt
 # NVML
 NVML_INC=/usr/include/nvidia/gdk/
 NVML_LIB=-lnvidia-ml -L/usr/lib64/nvidia/
--- a/src/MS/main.cpp
+++ b/src/MS/main.cpp
@ -23,6 +23,9 @@
 #include <stdio.h>
 #include <string.h>
 #include <pthread.h>
+#include <casacore/casa/Quanta/Quantum.h>
+
+#include "cuda_profiler_api.h"

 #include <Dirac.h>
 #include <Radio.h>
@ -241,11 +244,7 @@ main(int argc, char **argv) {
     Data::readMSlist(Data::MSlist,&msnames);
    }
    if (Data::TableName) {
-     if (!doBeam) {
-      Data::readAuxData(Data::TableName,&iodata);
-     } else {
      Data::readAuxData(Data::TableName,&iodata,&beam);
-     }
     cout<<"Only one MS"<<endl;
    } else if (Data::MSlist) {
     Data::readAuxDataList(msnames,&iodata);
@ -256,11 +255,12 @@ main(int argc, char **argv) {
     srand(time(0)); /* use different seed */
    }

-    openblas_set_num_threads(1);//Data::Nt;
+    // openblas_set_num_threads(1);//Data::Nt;
+    // export OMP_NUM_THREADS=1
    /**********************************************************/
     int M,Mt,ci,cj,ck;  
   /* parameters */
-   double *p,*pinit,*pfreq;
+   double *p,*pinit;
   double **pm;
   complex double *coh;
   FILE *sfp=0;
@ -333,19 +333,6 @@ main(int argc, char **argv) {
    }
  }

-#ifdef USE_MIC
-  /* need for bitwise copyable parameter passing */
-  int *mic_pindex,*mic_chunks;
-  if ((mic_chunks=(int*)calloc((size_t)M,sizeof(int)))==0) {
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     exit(1);
-  }
-  if ((mic_pindex=(int*)calloc((size_t)Mt,sizeof(int)))==0) {
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     exit(1);
-  }
-  int cl=0;
-#endif

  /* update cluster array with correct pointers to parameters */
  cj=0;
@ -354,14 +341,8 @@ main(int argc, char **argv) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
    }
-#ifdef USE_MIC
-    mic_chunks[ci]=carr[ci].nchunk;
-#endif
    for (ck=0; ck<carr[ci].nchunk; ck++) {
      carr[ci].p[ck]=cj*8*iodata.N;
-#ifdef USE_MIC
-      mic_pindex[cl++]=carr[ci].p[ck];
-#endif
      cj++;
    }
  }
@ -418,8 +399,8 @@ main(int argc, char **argv) {

    double res_0,res_1,res_00,res_01;   
   /* previous residual */
-   double res_prev=CLM_DBL_MAX;
-   double res_ratio=5; /* how much can the residual increase before resetting solutions */
+   // double res_prev=CLM_DBL_MAX;
+   // double res_ratio=5; /* how much can the residual increase before resetting solutions */
   res_0=res_1=res_00=res_01=0.0;

    /**********************************************************/
@ -462,19 +443,18 @@ main(int argc, char **argv) {


    /* starting iterations are doubled */
-    int start_iter=1;
-    int sources_precessed=0;
+    // int start_iter=1;
+    // int sources_precessed=0;

    double inv_c=1.0/CONST_C;

+#ifdef HAVE_CUDA
+    cudaProfilerStart();
+#endif
    while (msitr[0]->more()) {
      start_time = time(0);
      if (iodata.Nms==1) {
-       if (!doBeam) {
-        Data::loadData(msitr[0]->table(),iodata,&iodata.fratio);
-       } else {
        Data::loadData(msitr[0]->table(),iodata,beam,&iodata.fratio);
-       }
      } else { 
       Data::loadDataList(msitr,iodata,&iodata.fratio);
      }
@ -489,45 +469,16 @@ main(int argc, char **argv) {
    preset_flags_and_data(iodata.Nbase*iodata.tilesz,iodata.flag,barr,iodata.x,Data::Nt);
    /* if data is being whitened, whiten x here,
     no need for a copy because we use xo for residual calculation */
-    if (Data::whiten) {
-     whiten_data(iodata.Nbase*iodata.tilesz,iodata.x,iodata.u,iodata.v,iodata.freq0,Data::Nt);
-    }
    /* precess source locations (also beam pointing) from J2000 to JAPP if we do any beam predictions,
      using first time slot as epoch */
-    if (doBeam && !sources_precessed) {
-      precess_source_locations(beam.time_utc[iodata.tilesz/2],carr,M,&beam.p_ra0,&beam.p_dec0,Data::Nt);
-      sources_precessed=1;
-    }
+    // sources_precessed=1;


-#ifdef USE_MIC
-  double *mic_u,*mic_v,*mic_w,*mic_x;
-  mic_u=iodata.u;
-  mic_v=iodata.v;
-  mic_w=iodata.w;
-  mic_x=iodata.x;
-  int mic_Nbase=iodata.Nbase;
-  int mic_tilesz=iodata.tilesz;
-  int mic_N=iodata.N;
-  double mic_freq0=iodata.freq0;
-  double mic_deltaf=iodata.deltaf;
-  double mic_data_min_uvcut=Data::min_uvcut;
-  int mic_data_Nt=Data::Nt;
-  int mic_data_max_emiter=Data::max_emiter;
-  int mic_data_max_iter=Data::max_iter;
-  int mic_data_max_lbfgs=Data::max_lbfgs;
-  int mic_data_lbfgs_m=Data::lbfgs_m;
-  int mic_data_gpu_threads=Data::gpu_threads;
-  int mic_data_linsolv=Data::linsolv;
-  int mic_data_solver_mode=Data::solver_mode;
-  int mic_data_randomize=Data::randomize;
-  double mic_data_nulow=Data::nulow;
-  double mic_data_nuhigh=Data::nuhigh;
-#endif

   /* FIXME: uvmin is not needed in calibration, because its taken care of by flags */
    if (!Data::DoSim) {
    /****************** calibration **************************/
+<<<<<<< HEAD
 #ifndef HAVE_CUDA
    if (!doBeam) {
     precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
@ -685,15 +636,19 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
 #endif

    }
+=======
+#ifdef HAVE_CUDA
+     precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
+  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
+#endif
+#ifndef HAVE_CUDA
+     precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
+#endif
+>>>>>>> master
    /****************** end calibration **************************/
    /****************** begin diagnostics ************************/
-#ifdef HAVE_CUDA
-    if (Data::DoDiag) {
-     calculate_diagnostics(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,Data::DoDiag,Data::Nt);
-    }
-#endif /* HAVE_CUDA */
-    /****************** end diagnostics **************************/
   } else {
+<<<<<<< HEAD
    /************ simulation only mode ***************************/
    if (!solfile) {
 #ifndef HAVE_CUDA
@ -724,6 +679,23 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
    }
    /************ end simulation only mode ***************************/
   }
+=======
+#ifdef HAVE_CUDA
+      predict_visibilities_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
+  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,(Data::DoSim>1?1:0));
+#endif
+#ifndef HAVE_CUDA
+     precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
+  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
+#endif    
+}
+
+#ifdef HAVE_CUDA
+    cudaDeviceSynchronize();
+    cudaProfilerStop();
+    exit(0);
+#endif
+>>>>>>> master

   tilex+=iodata.tilesz;
   /* print solutions to file */
@ -740,33 +712,11 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
   }

    /**********************************************************/
-      /* also write back */
-    if (iodata.Nms==1) {
-     Data::writeData(msitr[0]->table(),iodata);
-    } else {
-     Data::writeDataList(msitr,iodata);
-    }
    for(int cm=0; cm<iodata.Nms; cm++) {
      (*msitr[cm])++;
    }
-   if (!Data::DoSim) {
-   /* if residual has increased too much, or all are flagged (0 residual)
-      or NaN
-      reset solutions to original
-      initial values */
-   if (res_1==0.0 || !isfinite(res_1) || res_1>res_ratio*res_prev) {
-     cout<<"Resetting Solution"<<endl;
-     /* reset solutions so next iteration has default initial values */
-     memcpy(p,pinit,(size_t)iodata.N*8*Mt*sizeof(double));
-     /* also assume iterations have restarted from scratch */
-     start_iter=1;
-     /* also forget min residual (otherwise will try to reset it always) */
-     res_prev=res_1;
-   } else if (res_1<res_prev) { /* only store the min value */
-    res_prev=res_1;
-   }
-   }
    end_time = time(0);
+
    elapsed_time = ((double) (end_time-start_time)) / 60.0;
    if (!Data::DoSim) {
    if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) { 
@ -790,10 +740,6 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
    Data::freeData(iodata,beam);
   }

-#ifdef USE_MIC
-   free(mic_pindex);
-   free(mic_chunks);
-#endif
    /**********************************************************/

  exinfo_gaussian *exg;
--- a/src/lib/Dirac/Makefile
+++ b/src/lib/Dirac/Makefile
@ -1,11 +1,13 @@
 CC=gcc
 CXX=g++
-#CFLAGS= -Wall -O3 -g #-pg
-CFLAGS= -Wall -O3 -fopt-info-optimized
+CFLAGS= -Wall -g -pg
+# Extra args for making callgraphs.
+# CFLAGS= -Wall -pg -O2 -ansi -fPIC -fpermissive -fno-omit-frame-pointer -DNDEBUG -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls
+# CFLAGS= -Wall -O3 -fopt-info-optimized
 CLIBS= -lm -lpthread
 #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
 #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
-LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
+LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread


 INCLUDES= -I. 
--- a/src/lib/Dirac/Makefile.gpu
+++ b/src/lib/Dirac/Makefile.gpu
@ -4,7 +4,6 @@ NVCC=nvcc
 CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE -pg
 CLIBS= -lm -lpthread
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
-# LAPACK=-lblas -lgfortran -lpthread

 CUDAINC=/usr/local/cuda/include
 CUDALIB=-L/usr/local/cuda/lib64 -lcuda -lcudart
--- a/src/lib/Dirac/clmfit.c
+++ b/src/lib/Dirac/clmfit.c
@ -146,7 +146,7 @@ clevmar_der_single(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;

  err=cudaSetDevice(card);
  checkCudaError(err,__FILE__,__LINE__);
@ -199,7 +199,7 @@ clevmar_der_single(
  checkCudaError(err,__FILE__,__LINE__);


-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  int work_size=0;
  int *devInfo; 
  int devInfo_h=0;
@ -709,7 +709,7 @@ clevmar_der_single_cuda(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  unsigned long int moff; /* make sure offsets are multiples of 4 */
@ -742,7 +742,7 @@ clevmar_der_single_cuda(
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&ed, N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==1) {
    err=cudaMalloc((void**)&taud, M*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
@ -1378,7 +1378,7 @@ mlm_der_single_cuda(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  if (opts) {
@ -1435,7 +1435,7 @@ mlm_der_single_cuda(
  /* we need coherencies for only this cluster */
  err=cudaMalloc((void**) &cohd, Nbase*8*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==1) {
    /* QR solver ********************************/
    err=cudaMalloc((void**)&taud, M*sizeof(double));
--- a/src/lib/Dirac/clmfit_fl.c
+++ b/src/lib/Dirac/clmfit_fl.c
@ -143,7 +143,7 @@ oslevmar_der_single_cuda_fl(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  unsigned long int moff;
@ -704,7 +704,7 @@ clevmar_der_single_cuda_fl(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  unsigned long int moff;
--- a/src/lib/Dirac/clmfit_nocuda.c
+++ b/src/lib/Dirac/clmfit_nocuda.c
@ -188,7 +188,7 @@ clevmar_der_single_nocuda(
  WORK=Ud=Sd=VTd=0;
  me_data_t *dt=(me_data_t*)adata;
  setweights(M,aones,1.0,dt->Nt);
-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==0) {

  } else if (solve_axb==1) {
@ -739,7 +739,7 @@ mlm_der_single(
 #endif
      exit(1);
  }
-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==1) {
    /* QR solver ********************************/
    /* workspace query */
@ -1221,7 +1221,7 @@ oslevmar_der_single_nocuda(
  me_data_t *dt=(me_data_t*)adata;
  setweights(M,aones,1.0,dt->Nt);

-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==0) {

  } else if (solve_axb==1) {
--- a/src/lib/Dirac/consensus_poly.c
+++ b/src/lib/Dirac/consensus_poly.c
@ -283,185 +283,6 @@ find_prod_inverse(double *B, double *Bi, int Npoly, int Nf, double *fratio) {



-typedef struct thread_data_prod_inv_ {
- int startM;
- int endM;
- int M;
- int Nf;
- int Npoly;
- double *B;
- double *Bi;
- double *rho;
-} thread_data_prod_inv_t;
-
-
-/* worker thread function for calculating sum and inverse */ 
-static void*
-sum_inv_threadfn(void *data) {
- thread_data_prod_inv_t *t=(thread_data_prod_inv_t*)data;
- double w[1],*WORK,*U,*S,*VT;
-
- int k,ci,status,lwork=0;
- int Np2=t->Npoly*t->Npoly;
- /* allocate memory for the SVD here */
-  if ((U=(double*)calloc((size_t)Np2,sizeof(double)))==0) {
-    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
-  }
-  if ((VT=(double*)calloc((size_t)Np2,sizeof(double)))==0) {
-    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
-  }
-  if ((S=(double*)calloc((size_t)t->Npoly,sizeof(double)))==0) {
-    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
-  }
-
-  /* memory for SVD: use first location of Bi */
-  status=my_dgesvd('A','A',t->Npoly,t->Npoly,&(t->Bi[t->startM*Np2]),t->Npoly,S,U,t->Npoly,VT,t->Npoly,w,-1);
-  if (!status) {
-    lwork=(int)w[0];
-  } else {
-    printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
-    exit(1);
-  }
-  if ((WORK=(double*)calloc((size_t)lwork,sizeof(double)))==0) {
-    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
-  }
-
-
-
- /* iterate over clusters */
- for (k=t->startM; k<=t->endM; k++) {
-   memset(&(t->Bi[k*Np2]),0,sizeof(double)*Np2);
-   /* find sum */
-   for (ci=0; ci<t->Nf; ci++) {
-    /* outer product */
-    my_dgemm('N','T',t->Npoly,t->Npoly,1,t->rho[k+ci*t->M],&t->B[ci*t->Npoly],t->Npoly,&t->B[ci*t->Npoly],t->Npoly,1.0,&(t->Bi[k*Np2]),t->Npoly);
-   }
-   /* find SVD */
-   status=my_dgesvd('A','A',t->Npoly,t->Npoly,&(t->Bi[k*Np2]),t->Npoly,S,U,t->Npoly,VT,t->Npoly,WORK,lwork);
-   if (status) {
-    printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
-    exit(1);
-   }
-
-   /* find 1/singular values, and multiply columns of U with new singular values */
-   for (ci=0; ci<t->Npoly; ci++) {
-    if (S[ci]>CLM_EPSILON) {
-     S[ci]=1.0/S[ci];
-    } else {
-     S[ci]=0.0;
-    }
-    my_dscal(t->Npoly,S[ci],&U[ci*t->Npoly]);
-   }
-
-   /* find product U 1/S V^T */
-   my_dgemm('N','N',t->Npoly,t->Npoly,t->Npoly,1.0,U,t->Npoly,VT,t->Npoly,0.0,&(t->Bi[k*Np2]),t->Npoly);
-
- }
-
- free(U);
- free(VT);
- free(S);
- free(WORK);
- return NULL;
-}
-
-/* build matrix with polynomial terms
-  B : Npoly x Nf, each row is one basis function
-  Bi: Npoly x Npoly pseudo inverse of sum( B(:,col) x B(:,col)' ) : M times
-  Npoly : total basis functions
-  Nf: frequencies
-  M: clusters
-  rho: NfxM array of regularization factors (for each freq, M values)
-  Sum taken is a weighted sum, using weights in rho, rho is assumed to change for each freq,cluster pair 
-
-  Nt: no. of threads
-*/
-int
-find_prod_inverse_full(double *B, double *Bi, int Npoly, int Nf, int M, double *rho, int Nt) {
-
-  pthread_attr_t attr;
-  pthread_t *th_array;
-  thread_data_prod_inv_t *threaddata;
-
-  int ci,Nthb0,Nthb,nth,nth1;
-  /* clusters per thread */
-  Nthb0=(M+Nt-1)/Nt;
-
-  /* setup threads */
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
-
-  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if ((threaddata=(thread_data_prod_inv_t*)malloc((size_t)Nt*sizeof(thread_data_prod_inv_t)))==0) {
-    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-    exit(1);
-  }
-
-
-  ci=0;
-  for (nth=0;  nth<Nt && ci<M; nth++) {
-    if (ci+Nthb0<M) { 
-     Nthb=Nthb0;
-    } else {
-     Nthb=M-ci;
-    }
-    threaddata[nth].B=B;
-    threaddata[nth].Bi=Bi;
-    threaddata[nth].rho=rho;
-    threaddata[nth].Npoly=Npoly;
-    threaddata[nth].Nf=Nf;
-    threaddata[nth].M=M;
-    threaddata[nth].startM=ci;
-    threaddata[nth].endM=ci+Nthb-1;
-
-    pthread_create(&th_array[nth],&attr,sum_inv_threadfn,(void*)(&threaddata[nth]));
-    ci=ci+Nthb;
-  }
-
-  for(nth1=0; nth1<nth; nth1++) {
-   pthread_join(th_array[nth1],NULL);
-  }
-  
-  pthread_attr_destroy(&attr);
-
-
-  free(th_array);
-  free(threaddata);
-
-#ifdef DEBUG
-  int k,cj;
-  for (k=0; k<M; k++) {
-    printf("dir_%d=",k);
-    for (cj=0; cj<Nf; cj++) {
-      printf("%lf ",rho[k+cj*M]);
-    }
-    printf("\n");
-  }
-  for (k=0; k<M; k++) {
-  printf("Bii_%d=[\n",k);
-  for (ci=0; ci<Npoly; ci++) {
-   for(cj=0; cj<Npoly; cj++) {
-    printf("%lf ",Bi[k*Npoly*Npoly+ci*Npoly+cj]);
-   }
-   printf("\n");
-  }
-  printf("];\n");
-
-  }
-#endif
-
-
-  return 0;
-}
-
-
 /* update Z
   Z: 8N Npoly x M double array (real and complex need to be updated separate)
   N : stations
@ -526,323 +347,3 @@ update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi) {
 free(Q);
 return 0;
 }
-
-
-typedef struct thread_data_update_z_ {
- int startM;
- int endM;
- int N;
- int M;
- int Npoly;
- double *Z;
- double *z;
- double *Bi;
-} thread_data_update_z_t;
-
-
-/* worker thread function for updating z */
-static void*
-update_z_threadfn(void *data) {
-  thread_data_update_z_t *t=(thread_data_update_z_t*)data;
-
- /* one block of Z for one direction 2Nx2xNpoly (complex)
-    and 8NxNpoly  real values : select one column : 2NxNpoly (complex)
-    select real,imag : 2NxNpoly each (vector)
-    reshape each to 2NxNpoly matrix => Q
-    Bi : NpolyxNpoly matrix = B^T
-    
-    for each direction (M values)
-    select 2N,2N,... : 2Nx Npoly complex values from z (ordered by M)
-    select real,imag: size 2NxNpoly, 2NxNpoly vectors
-    reshape to 2NxNpoly => R
-    reshape to 2NxNpoly => I (imag)
-    
-    then Q=([R I] Bi^T) for each column
-    Q=[R_1^T I_1^T R_2^T I_2^T]^T Bi^T for 2 columns
-    R_1,I_1,R_2,I_2 : size 2NxNpoly 
-    R : (2N 4) x Npoly
-    so find Q
- */
- double *R,*Q;
- if ((R=(double*)calloc((size_t)2*t->N*t->Npoly*4,sizeof(double)))==0) {
-    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
- }
- if ((Q=(double*)calloc((size_t)2*t->N*t->Npoly*4,sizeof(double)))==0) {
-    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
- }
- 
- int ci,np;
- for (ci=t->startM; ci<=t->endM; ci++) {
-  for (np=0; np<t->Npoly; np++) {
-   /* select 2N */
-   my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M], 4, &R[np*8*t->N], 1); /* R_1 */
-   my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+1], 4, &R[np*8*t->N+2*t->N], 1); /* I_1 */
-   my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+2], 4, &R[np*8*t->N+2*2*t->N], 1); /* R_2 */
-   my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+3], 4, &R[np*8*t->N+3*2*t->N], 1); /* I_2 */
-  }
-  /* find Q=R B^T */
-  memset(Q,0,sizeof(double)*2*t->N*t->Npoly*4);
-  my_dgemm('N','N',8*t->N,t->Npoly,t->Npoly,1.0,R,8*t->N,&t->Bi[ci*t->Npoly*t->Npoly],t->Npoly,1.0,Q,8*t->N);
-  /* copy back to Z */ 
-  for (np=0; np<t->Npoly; np++) {
-   my_dcopy(2*t->N, &Q[np*8*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np], 4); 
-   my_dcopy(2*t->N, &Q[np*8*t->N+2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+1], 4); 
-   my_dcopy(2*t->N, &Q[np*8*t->N+2*2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+2], 4); 
-   my_dcopy(2*t->N, &Q[np*8*t->N+3*2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+3], 4); 
-  }
-   
- }
-
- free(R);
- free(Q);
-
- return NULL;
-}
-
-/* update Z
-   Z: 8N Npoly x M double array (real and complex need to be updated separate)
-   N : stations
-   M : clusters
-   Npoly: no of basis functions
-   z : right hand side 8NM Npoly x 1 (note the different ordering from Z)
-   Bi : M values of NpolyxNpoly matrices, Bi^T=Bi assumed
-
-   Nt: no. of threads
-*/
-int 
-update_global_z_multi(double *Z,int N,int M,int Npoly,double *z,double *Bi, int Nt) {
-   pthread_attr_t attr;
-   pthread_t *th_array;
-   thread_data_update_z_t *threaddata;
-
-  int ci,Nthb0,Nthb,nth,nth1;
-  /* clusters per thread */
-  Nthb0=(M+Nt-1)/Nt;
-
-  /* setup threads */
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
-
-  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if ((threaddata=(thread_data_update_z_t*)malloc((size_t)Nt*sizeof(thread_data_update_z_t)))==0) {
-    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-    exit(1);
-  }
-
-
-  ci=0;
-  for (nth=0;  nth<Nt && ci<M; nth++) {
-    if (ci+Nthb0<M) {
-     Nthb=Nthb0;
-    } else {
-     Nthb=M-ci;
-    }
-    threaddata[nth].z=z;
-    threaddata[nth].Z=Z;
-    threaddata[nth].Bi=Bi;
-    threaddata[nth].N=N;
-    threaddata[nth].M=M;
-    threaddata[nth].Npoly=Npoly;
-    threaddata[nth].startM=ci;
-    threaddata[nth].endM=ci+Nthb-1;
-
-    pthread_create(&th_array[nth],&attr,update_z_threadfn,(void*)(&threaddata[nth]));
-    ci=ci+Nthb;
-  }
-
-  for(nth1=0; nth1<nth; nth1++) {
-   pthread_join(th_array[nth1],NULL);
-  }
-
-  pthread_attr_destroy(&attr);
-
-
-  free(th_array);
-  free(threaddata);
- 
- return 0;
-}
-
-/* generate a random integer in the range 0,1,...,maxval */
-int
-random_int(int maxval) {
-  double rat=(double)random()/(double)RAND_MAX;
-  double y=rat*(double)(maxval+1);
-  int x=(int)floor(y);
-  return x;
-}
-
-
-typedef struct thread_data_rho_bb_ {
- int startM;
- int endM;
- int offset;
- int M;
- int N;
- double *rho;
- double *rhoupper;
- double *deltaY;
- double *deltaJ;
- clus_source_t *carr;
-} thread_data_rho_bb_t;
-
-
-/* worker thread function for calculating sum and inverse */
-static void*
-rho_bb_threadfn(void *data) {
- thread_data_rho_bb_t *t=(thread_data_rho_bb_t*)data;
- double alphacorrmin=0.2;
- int ci,ck;
- double ip11,ip12,ip22;
- ck=t->offset;
- for (ci=t->startM; ci<=t->endM; ci++) {
-   ip12=my_ddot(8*t->N*t->carr[ci].nchunk,&t->deltaY[ck],&t->deltaJ[ck]); /* x^T y */
-   /* further computations are only required if there is +ve correlation */
-   if (ip12>CLM_EPSILON) {
-   /* find the inner products */
-   ip11=my_dnrm2(8*t->N*t->carr[ci].nchunk,&t->deltaY[ck]); /* || ||_2 */
-   ip22=my_dnrm2(8*t->N*t->carr[ci].nchunk,&t->deltaJ[ck]); /* || ||_2 */
-   /* square the norm to get dot prod */
-   ip11*=ip11;
-   ip22*=ip22;
-   /* only try to do an update if the 'delta's are finite, also 
-     there is tangible correlation between the two deltas */
-#ifdef DEBUG
-   printf("%d ip11=%lf ip12=%lf ip22=%lf\n",ci,ip11,ip12,ip22);
-#endif
-   if (ip11>CLM_EPSILON && ip22>CLM_EPSILON) {
-     double alphacorr=ip12/sqrt(ip11*ip22);
-     /* decide on whether to do further calculations only if there is sufficient correlation 
-        between the deltas */
-     if (alphacorr>alphacorrmin) {
-     double alphaSD=ip11/ip12;
-     double alphaMG=ip12/ip22;
-     double alphahat;
-     if (2.0*alphaMG>alphaSD) {
-      alphahat=alphaMG;
-     } else {
-      alphahat=alphaSD-alphaMG*0.5;
-     }
-#ifdef DEBUG
-     printf("alphacorr=%lf alphaSD=%lf alphaMG=%lf alphahat=%lf rho=%lf\n",alphacorr,alphaSD,alphaMG,alphahat,t->rho[ci]);
-#endif
-      /* decide on whether to update rho based on heuristics */
-      if (alphahat> 0.001 && alphahat<t->rhoupper[ci]) {
-#ifdef DEBUG
-       printf("updating rho from %lf to %lf\n",t->rho[ci],alphahat);
-#endif
-       t->rho[ci]=alphahat;
-      }
-     }
-   }
-  
-   } 
-   ck+=t->N*8*t->carr[ci].nchunk;
- }
- return NULL;
-}
-
-
-/* Barzilai & Borwein update of rho [Xu et al] */
-/* rho: Mx1 values, to be updated
-   rhoupper: Mx1 values, upper limit of rho
-   N: no of stations
-   M : clusters
-   Mt: actual clusters (include hybrid parameter) Mt >= M
-   carr: cluster info array, to get hybrid parameters: Mx1
-   Yhat: current Yhat : 8*N*Mt 
-   Yhat_k0 : old Yhat at previous update of rho : 8*N*Mt
-   J: current solution : 8*N*Mt
-   J_k0: old solution at previous update of rho : 8*N*Mt
-   Nt: no. of threads
-*/ 
-int
-update_rho_bb(double *rho, double *rhoupper, int N, int M, int Mt, clus_source_t *carr, double *Yhat, double *Yhat_k0, double *J, double *J_k0, int Nt) {
-
- double *deltaY; /* Yhat - Yhat_k0 */
- double *deltaJ; /* J - J_k0 (with J_k0 projected to tangent plane of J)*/
- if ((deltaY=(double*)calloc((size_t)8*N*Mt,sizeof(double)))==0) {
-    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
- }
- if ((deltaJ=(double*)calloc((size_t)8*N*Mt,sizeof(double)))==0) {
-    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
- }
- 
-
- my_dcopy(8*N*Mt, Yhat, 1, deltaY, 1); 
- my_daxpy(8*N*Mt, Yhat_k0, -1.0, deltaY);
-//no need to remove unitary ambiguity from J-Jold
- my_dcopy(8*N*Mt, J, 1, deltaJ, 1); 
- my_daxpy(8*N*Mt, J_k0,-1.0, deltaJ);
-
-  pthread_attr_t attr;
-  pthread_t *th_array;
-  thread_data_rho_bb_t *threaddata;
-
-  int ci,cj,ck,Nthb0,Nthb,nth,nth1;
-  /* clusters per thread */
-  Nthb0=(M+Nt-1)/Nt;
-
-  /* setup threads */
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
-
-  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if ((threaddata=(thread_data_rho_bb_t*)malloc((size_t)Nt*sizeof(thread_data_rho_bb_t)))==0) {
-    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-    exit(1);
-  }
-
-
-  ci=0;
-  ck=0;
-  for (nth=0;  nth<Nt && ci<M; nth++) {
-    if (ci+Nthb0<M) {
-     Nthb=Nthb0;
-    } else {
-     Nthb=M-ci;
-    }
-    threaddata[nth].N=N;
-    threaddata[nth].M=M;
-    threaddata[nth].offset=ck;
-    threaddata[nth].startM=ci;
-    threaddata[nth].endM=ci+Nthb-1;
-    threaddata[nth].rho=rho;
-    threaddata[nth].rhoupper=rhoupper;
-    threaddata[nth].deltaY=deltaY;
-    threaddata[nth].deltaJ=deltaJ;
-    threaddata[nth].carr=carr;
-    /* find the right offset too, since ci is not always incremented by 1 need to add up */
-    for (cj=ci; cj<ci+Nthb && cj<M; cj++) {
-      ck+=N*8*carr[cj].nchunk;
-    }
-
-
-    pthread_create(&th_array[nth],&attr,rho_bb_threadfn,(void*)(&threaddata[nth]));
-    ci=ci+Nthb;
-  }
-
-  for(nth1=0; nth1<nth; nth1++) {
-   pthread_join(th_array[nth1],NULL);
-  }
-
-  pthread_attr_destroy(&attr);
-
-  free(th_array);
-  free(threaddata);
-
-
- free(deltaY);
- free(deltaJ);
- return 0;
-}
--- a/src/lib/Dirac/dataio.c
+++ b/src/lib/Dirac/dataio.c
@ -1,82 +0,0 @@
-/*
- *
- Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- 
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- 
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- $Id$
- */
-
-
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-
-#include "Dirac.h"
-
-int 
-open_data_stream(int file, double **d, int *count, int *N, double *freq0, double *ra0, double *dec0) {
-  struct stat statbuf;
-
-  int ig;
-  
-
- /* find the file size */
- if (fstat (file,&statbuf) < 0) {
-   fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__);
-   exit(1);
- }
-
- //printf("file size (bytes) %d\n",(int)statbuf.st_size);
- /* total double values is size/8 */
- *count=statbuf.st_size/8;
- //printf("total double values %d\n",*count);
-  
-  /* map the file to memory */
-  *d= (double*)mmap(NULL,  statbuf.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, file, 0);
-  if ( !d) {
-     fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__);
-		 exit(1);
-  }
-
-  /* remove header from data */
-  *N=(int)(*d)[0];
-  *freq0=(*d)[1];
-  *ra0=(*d)[2];
-  *dec0=(*d)[3];
-  /* read ignored stations and discard them */
-  ig=(int)(*d)[4]; 
-  /* make correct value for N */
-  *N=*N-ig;
- 
-  printf("Ignoring %d stations\n",ig);
-  /* increment to data */
-  *d=&((*d)[5+ig]); 
-  
-  return(0);
-}
-
-
-int
-close_data_stream(double *d, int count) {
-  
-  /* sync to disk */
-  msync(d, (size_t)count*sizeof(double), MS_SYNC );
-  munmap((void*)d, (size_t)count*sizeof(double));
-  return 0;
-}
-
--- a/src/lib/Dirac/lbfgs.c
+++ b/src/lib/Dirac/lbfgs.c
@ -102,7 +102,7 @@ pipeline_slave_code_b(void *data)
    checkCudaError(err,__FILE__,__LINE__);
  } else if (dp->status[tid]==PT_DO_CCOST) {
   /* divide total baselines by 2 */
-   int BlocksPerGrid=(dp->Nbase[tid]+dp->lmdata[tid]->ThreadsPerBlock-1)/dp->lmdata[tid]->ThreadsPerBlock;
+   int BlocksPerGrid= 2*(dp->Nbase[tid]+dp->lmdata[tid]->ThreadsPerBlock-1)/dp->lmdata[tid]->ThreadsPerBlock;
   int  boff=dp->boff[tid];
   /* copy the current solution to device */
   err=cudaMemcpy(dp->cpp[tid], dp->lmdata[tid]->p, m*sizeof(double), cudaMemcpyHostToDevice);
@ -901,12 +901,12 @@ lbfgs_fit_common(
  /* parameters per thread (GPU) */
  int Nparm=(m+2-1)/2;
  /* find number of blocks */
-  int BlocksPerGrid = (Nparm+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid = 2* (Nparm+ThreadsPerBlock-1)/ThreadsPerBlock;
  ci=0;
  int nth;
  for (nth=0; nth<2; nth++) {
   threaddata[nth].ThreadsPerBlock=ThreadsPerBlock;
-   threaddata[nth].BlocksPerGrid=BlocksPerGrid;
+   threaddata[nth].BlocksPerGrid= 2*BlocksPerGrid;
   threaddata[nth].card=nth;
   threaddata[nth].Nbase=dp->Nbase;
   threaddata[nth].tilesz=dp->tilesz;
--- a/src/lib/Dirac/myblas.c
+++ b/src/lib/Dirac/myblas.c
@ -206,7 +206,7 @@ __attribute__ ((target(MIC)))
 }


-/* following routines used in LAPACK solvers */
+/* following routines used in LAPACK solvers */
 /* cholesky factorization: real symmetric */
 int
 my_dpotrf(char uplo, int N, double *A, int lda) {
--- a/src/lib/Dirac/oslmfit.c
+++ b/src/lib/Dirac/oslmfit.c
@ -157,7 +157,7 @@ oslevmar_der_single_cuda(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  unsigned long int moff;
@ -190,7 +190,7 @@ oslevmar_der_single_cuda(
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&ed, N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==1) {
    err=cudaMalloc((void**)&taud, M*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
--- a/src/lib/Dirac/robustlm.c
+++ b/src/lib/Dirac/robustlm.c
@ -159,7 +159,7 @@ rlevmar_der_single_cuda(
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* DEFAULT_TH_PER_BK/8 for accessing each element of a baseline */
  int ThreadsPerBlock2=Nd/2; /* for evaluating nu */
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  unsigned long int moff;
@ -196,7 +196,7 @@ rlevmar_der_single_cuda(
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&ed, N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==1) {
    err=cudaMalloc((void**)&taud, M*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
@ -805,7 +805,7 @@ rlevmar_der_single_cuda_fl(
  /* FIXME: might need a large value for large no of baselines */
  int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* for accessing each element of a baseline */
  int ThreadsPerBlock2=Nd/2; /* for evaluating nu */
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  unsigned long int moff;
@ -1382,7 +1382,7 @@ osrlevmar_der_single_cuda_fl(
  /* FIXME: might need a large value for large no of baselines */
  int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* for accessing each element of a baseline */
  int ThreadsPerBlock2=Nd/2; /* for evaluating nu */
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  unsigned long int moff;
@ -2180,7 +2180,7 @@ rlevmar_der_single_nocuda(
  setweights(M,aones,1.0,lmdata->Nt);
  /*W set initial weights to 1 */
  setweights(N,wtd,1.0,lmdata->Nt);
-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==0) {

  } else if (solve_axb==1) {
@ -2766,7 +2766,7 @@ osrlevmar_der_single_nocuda(
  /*W set initial weights to 1 */
  setweights(N,wtd,1.0,lmdata0->Nt);

-  /* memory allocation: different solvers */
+  /* memory allocation: different solvers */
  if (solve_axb==0) {

  } else if (solve_axb==1) {
--- a/src/lib/Dirac/rtr_solve_cuda.c
+++ b/src/lib/Dirac/rtr_solve_cuda.c
@ -582,7 +582,7 @@ rtr_solve_cuda_fl(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  /* reshape x to make J: 2Nx2 complex double 
--- a/src/lib/Dirac/rtr_solve_robust_cuda.c
+++ b/src/lib/Dirac/rtr_solve_robust_cuda.c
@ -602,7 +602,7 @@ rtr_solve_cuda_robust_fl(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  /* reshape x to make J: 2Nx2 complex double 
@ -985,7 +985,7 @@ nsd_solve_cuda_robust_fl(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  /* reshape x to make J: 2Nx2 complex double 
--- a/src/lib/Dirac/rtr_solve_robust_cuda_admm.c
+++ b/src/lib/Dirac/rtr_solve_robust_cuda_admm.c
@ -517,7 +517,7 @@ rtr_solve_cuda_robust_admm_fl(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  /* reshape x to make J: 2Nx2 complex double 
@ -947,7 +947,7 @@ nsd_solve_cuda_robust_admm_fl(

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;


  /* reshape x to make J: 2Nx2 complex double 
--- a/src/lib/Radio/Makefile
+++ b/src/lib/Radio/Makefile
@ -2,11 +2,12 @@ CC=gcc
 CXX=g++
 #CFLAGS= -Wall -O3 -g #-pg
 CFLAGS= -Wall -O3 -fopt-info-optimized
+# CFLAGS= -Wall -pg -O2 -ansi -fPIC -fpermissive -fno-omit-frame-pointer -DNDEBUG -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls
 CLIBS= -lm -lpthread
 #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
 #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
-LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
-
+#LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
+LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread

 INCLUDES= -I. -I../Dirac/ 
 LIBPATH=
--- a/src/lib/Radio/Makefile.gpu
+++ b/src/lib/Radio/Makefile.gpu
@ -3,7 +3,8 @@ CXX=g++
 NVCC=nvcc
 CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE -pg
 CLIBS= -lm -lpthread
-LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
+#LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
+LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread
 # LAPACK=-lblas -lgfortran -lpthread

 CUDAINC=/usr/local/cuda/include
--- a/src/lib/Radio/predict_model.cu
+++ b/src/lib/Radio/predict_model.cu
--- a/src/lib/Radio/predict_withbeam.c
+++ b/src/lib/Radio/predict_withbeam.c
@ -1114,7 +1114,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
    for (cj=0; cj<carr[cm].nchunk; cj++) {
      pm=&(p[carr[cm].p[cj]]); /* start of solutions */
      /* extract phase of pm, output to pphase */
-      extract_phases(pm,pphase,N,10);
+      // extract_phases(pm,pphase,N,10);
      /* invert N solutions */
      for (ci=0; ci<N; ci++) {
       mat_invert(&pphase[8*ci],&pinv[8*ci+8*N*cj], rho);
--- a/src/lib/Radio/predict_withbeam_gpu.c
+++ b/src/lib/Radio/predict_withbeam_gpu.c
--- a/test/Generate_sources.py
+++ b/test/Generate_sources.py
@ -28,6 +28,9 @@ tol_seconds_of_RA = tol_seconds_of_decl * 24/360
 RA_hours_3C196 = 0
 RA_minutes_3C196 = 0
 RA_seconds_3C196 = 0
+#RA_hours_3C196 = 8
+#RA_minutes_3C196 = 13
+#RA_seconds_3C196 = 35.981540

 RA_seconds_3C196 = (RA_hours_3C196 * 60 + RA_minutes_3C196) *60 + RA_seconds_3C196

@ -42,6 +45,9 @@ RA_hours_high, RA_minutes_high = divmod(RA_minutes_high, 60)
 decl_degrees_3C196 = 90
 decl_minutes_3C196 = 0
 decl_seconds_3C196 = 0
+#decl_degrees_3C196 = 48
+#decl_minutes_3C196 = 12
+#decl_seconds_3C196 = 59.174770

 decl_seconds_3C196 = (decl_degrees_3C196 * 60 + decl_minutes_3C196) *60 + decl_seconds_3C196

@ -68,9 +74,8 @@ with warnings.catch_warnings():
             'pos_angle', 'freq0')

    formats = ['U6', 'i4', 'i4', 'f8', 'i4', 'i4', 'f8', 'f8', 'i4', 'i4', 'i4', 'f8', 'f8',
-               'i4', 'i4', 'i4', 'i4', 'f8', 'f8']
-
-    formats_reformatted = '%s  %d  %d  %f  %d  %d  %f  %f  %d  %d  %d  %f  %f  %d  %d  %d  %d  %f  %f'
+      'f8', 'f8', 'f8', 'f8', 'f8', 'f8']
+    formats_reformatted = '%s  %d  %d  %f  %d  %d  %f  %f  %d  %d  %d  %f  %f  %f  %f  %f  %f  %f  %f'

    sources_parameters = np.recarray((number_of_sources,), formats=formats,
                                     names=names)
--- a/test/dosage.sh
+++ b/test/dosage.sh
@ -1,2 +1,2 @@
 # Before running this, untar sm.ms.tar and build sagecal
-../src/MS/sagecal -d sm.ms -s 3c196.sky.txt -c 3c196.sky.txt.cluster -n 4 -t 10 -p sm.ms.solutions -e 4 -g 2 -l 10 -m 7 -x 30 -F 1 -j 5  -k -1 -B 1 -W 0 > sm.ms.output
+../src/MS/sagecal -d sm.ms -s extended_source_list.txt -c extended_source_list.txt.cluster -n 4 -t 10 -p sm.ms.solutions -e 4 -g 2 -l 10 -m 7 -x 30 -F 1 -j 2  -k -1 -B 1 -W 0 > sm.ms.output