Merge branch 'master' into sourceforge

2018-02-06 10:59:45 +01:00 · 2018-02-06 10:59:45 +01:00 · 9110379d6a
parent 97f9b51ba3 538abbe6c1
commit 9110379d6a
27 changed files with 764 additions and 2581 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,25 @@
 */*.png
 */*.out
 */*.output
 */*.solutions
 */*.dot
 test/sm.ms/
 test/nvprof-resultaten/
 src/MS/sagecal
 test/analysis-*.txt
 test/extended_*.*
 */*/*.a
 */*/*.o
 */*/*.swp
 */*/*.swo
 */*/*.out
 */*/*.output
 */*/*/*.o
 */*/*/*.a
 */*/*/*.swp
 */*/*/*.swo
 */*/*/*.out
 */*/*/*.output
--- a/INSTALL.md
+++ b/INSTALL.md
@ -1,7 +1,8 @@
 vr  2 dec 2016 23:07:19 CET
-SAGECal Installation
+
-====================
+# SAGECal Installation
-0) Prerequsites:
+
 ## 1 Prerequsites:
 - CASACORE http://casacore.googlecode.com/
 - glib http://developer.gnome.org/glib
 - BLAS/LAPACK
@ -15,12 +16,13 @@ SAGECal Installation
  -- Intel MKL and other libraries
 - Get the source for SAGECal : git clone git://git.code.sf.net/p/sagecal/code sagecal-code
-1) The basic way to build is
+## 2 The basic way to build is
  1.a) go to ./src/lib  and run make (which will create libsagecal.a)
  1.b) go to ./src/MS and run make (which will create the executable)
-2) In ./src/lib and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are:
+## 3 Build settings
 In ./src/lib and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are:
 - LAPACK: directory where LAPACK/OpenBLAS is installed
 - GLIBI/GLIBL: include/lib files for glib
 - CASA_LIBDIR/CASA_INCDIR/CASA_LIBS : casacore include/library location and files:
@ -39,23 +41,23 @@ SAGECal Installation
-SAGECAL-MPI Installation
+# SAGECAL-MPI Installation
-========================
+
-0) Prerequsites:
+## 1 Prerequsites:
 - Same as above 
 - MPI (e.g. OpenMPI)
-1) Build ./src/lib as above (using mpicc -DMPI_BUILD)
+## 2 Build ./src/lib as above (using mpicc -DMPI_BUILD)
-2) Build ./src/MPI using mpicc++
+## 3 Build ./src/MPI using mpicc++
-BUILDSKY Installation
+## BUILDSKY Installation
-=====================
+
-1) See INSTALL in ./src/buildsky
+  - See INSTALL in ./src/buildsky
-RESTORE Installation
+## RESTORE Installation
-=====================
+
-1) See INSTALL in ./src/restore
+  - See INSTALL in ./src/restore
--- a/README.md
+++ b/README.md
@ -1,15 +1,17 @@
-SAGECAL
+# SAGECAL
-=======
+
 Read INSTALL for installation. This file gives a brief guide to use SAGECal.
 Warning: this file may be obsolete. use sagecal -h to see up-to-date options.
-Step by Step Introduction:
+## Step by Step Introduction:
-#######################################################################
+
 1a)Calibrate data in the standard way using BBS/CASA or anything else. 
 Use NDPP to average the data in your MS to a few channels (also average in time to about 10sec). Also flag any spikes in the data.
 1b)For subtraction of the ATeam from raw data (CasA,CygA,...), no initial calibration is necessary. Just run sagecal on raw data, but it is better to scale the sky model to match the apparent flux of the sources that are being subtracted.
-#######################################################################
+
 2) Sky Model:
 3a)Make an image of your MS (using ExCon/casapy). 
 Use Duchamp to create a mask for the image. Use buildsky to create a sky model. (see the README file on top level directory). Also create a proper cluster file.
--- a/src/MS/Makefile
+++ b/src/MS/Makefile
@ -36,5 +36,4 @@ data.o:data.cpp data.h
 sagecal:$(OBJECTS) ../lib/Radio/libsagecal.a ../lib/Dirac/libdirac.a 
 	$(CXX) $(CXXFLAGS) $(LDFLAGS) $(INCLUDES) $(GLIBI) $(LIBPATH)  -o $@  $(OBJECTS) $(MY_LIBS) $(LAPACK) $(CASA_LIBS)  $(GLIBL)
 clean:
-	rm *.o 
+	rm *.o *.tmp *.fits *.swp *.swo *.o *.output
--- a/src/MS/Makefile.gpu
+++ b/src/MS/Makefile.gpu
@ -10,9 +10,9 @@ LAPACK_DIR=/cm/shared/package/openblas/0.2.17mt/lib
 #LAPACK_DIR=/usr/lib/atlas/sse/
 CUDAINC=-I/cm/shared/apps/cuda80/toolkit/8.0.44/include/
-CUDALIB=-L/cm/shared/apps/cuda80/toolkit/8.0.44/lib64/ -lcuda -lcudart
+CUDALIB=-lcublas -lcusolver -lcudadevrt
-CULALIB=-lcublas -lcusolver -lcudadevrt
+# CULALIB=-lcublas -lcusolver -lcudadevrt
 # NVML
 NVML_INC=/usr/include/nvidia/gdk/
 NVML_LIB=-lnvidia-ml -L/usr/lib64/nvidia/
--- a/src/MS/main.cpp
+++ b/src/MS/main.cpp
@ -23,6 +23,9 @@
 #include <stdio.h>
 #include <string.h>
 #include <pthread.h>
 #include <casacore/casa/Quanta/Quantum.h>
 #include "cuda_profiler_api.h"
 #include <Dirac.h>
 #include <Radio.h>
@ -241,11 +244,7 @@ main(int argc, char **argv) {
     Data::readMSlist(Data::MSlist,&msnames);
    }
    if (Data::TableName) {
     if (!doBeam) {
      Data::readAuxData(Data::TableName,&iodata);
     } else {
      Data::readAuxData(Data::TableName,&iodata,&beam);
     }
     cout<<"Only one MS"<<endl;
    } else if (Data::MSlist) {
     Data::readAuxDataList(msnames,&iodata);
@ -256,11 +255,12 @@ main(int argc, char **argv) {
     srand(time(0)); /* use different seed */
    }
-    openblas_set_num_threads(1);//Data::Nt;
+    // openblas_set_num_threads(1);//Data::Nt;
    // export OMP_NUM_THREADS=1
    /**********************************************************/
     int M,Mt,ci,cj,ck;  
   /* parameters */
-   double *p,*pinit,*pfreq;
+   double *p,*pinit;
   double **pm;
   complex double *coh;
   FILE *sfp=0;
@ -333,19 +333,6 @@ main(int argc, char **argv) {
    }
  }
 #ifdef USE_MIC
  /* need for bitwise copyable parameter passing */
  int *mic_pindex,*mic_chunks;
  if ((mic_chunks=(int*)calloc((size_t)M,sizeof(int)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  if ((mic_pindex=(int*)calloc((size_t)Mt,sizeof(int)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  int cl=0;
 #endif
  /* update cluster array with correct pointers to parameters */
  cj=0;
@ -354,14 +341,8 @@ main(int argc, char **argv) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
    }
 #ifdef USE_MIC
    mic_chunks[ci]=carr[ci].nchunk;
 #endif
    for (ck=0; ck<carr[ci].nchunk; ck++) {
      carr[ci].p[ck]=cj*8*iodata.N;
 #ifdef USE_MIC
      mic_pindex[cl++]=carr[ci].p[ck];
 #endif
      cj++;
    }
  }
@ -418,8 +399,8 @@ main(int argc, char **argv) {
    double res_0,res_1,res_00,res_01;   
   /* previous residual */
-   double res_prev=CLM_DBL_MAX;
+   // double res_prev=CLM_DBL_MAX;
-   double res_ratio=5; /* how much can the residual increase before resetting solutions */
+   // double res_ratio=5; /* how much can the residual increase before resetting solutions */
   res_0=res_1=res_00=res_01=0.0;
    /**********************************************************/
@ -462,19 +443,18 @@ main(int argc, char **argv) {
    /* starting iterations are doubled */
-    int start_iter=1;
+    // int start_iter=1;
-    int sources_precessed=0;
+    // int sources_precessed=0;
    double inv_c=1.0/CONST_C;
 #ifdef HAVE_CUDA
    cudaProfilerStart();
 #endif
    while (msitr[0]->more()) {
      start_time = time(0);
      if (iodata.Nms==1) {
       if (!doBeam) {
        Data::loadData(msitr[0]->table(),iodata,&iodata.fratio);
       } else {
        Data::loadData(msitr[0]->table(),iodata,beam,&iodata.fratio);
       }
      } else { 
       Data::loadDataList(msitr,iodata,&iodata.fratio);
      }
@ -489,45 +469,16 @@ main(int argc, char **argv) {
    preset_flags_and_data(iodata.Nbase*iodata.tilesz,iodata.flag,barr,iodata.x,Data::Nt);
    /* if data is being whitened, whiten x here,
     no need for a copy because we use xo for residual calculation */
    if (Data::whiten) {
     whiten_data(iodata.Nbase*iodata.tilesz,iodata.x,iodata.u,iodata.v,iodata.freq0,Data::Nt);
    }
    /* precess source locations (also beam pointing) from J2000 to JAPP if we do any beam predictions,
      using first time slot as epoch */
-    if (doBeam && !sources_precessed) {
+    // sources_precessed=1;
      precess_source_locations(beam.time_utc[iodata.tilesz/2],carr,M,&beam.p_ra0,&beam.p_dec0,Data::Nt);
      sources_precessed=1;
    }
 #ifdef USE_MIC
  double *mic_u,*mic_v,*mic_w,*mic_x;
  mic_u=iodata.u;
  mic_v=iodata.v;
  mic_w=iodata.w;
  mic_x=iodata.x;
  int mic_Nbase=iodata.Nbase;
  int mic_tilesz=iodata.tilesz;
  int mic_N=iodata.N;
  double mic_freq0=iodata.freq0;
  double mic_deltaf=iodata.deltaf;
  double mic_data_min_uvcut=Data::min_uvcut;
  int mic_data_Nt=Data::Nt;
  int mic_data_max_emiter=Data::max_emiter;
  int mic_data_max_iter=Data::max_iter;
  int mic_data_max_lbfgs=Data::max_lbfgs;
  int mic_data_lbfgs_m=Data::lbfgs_m;
  int mic_data_gpu_threads=Data::gpu_threads;
  int mic_data_linsolv=Data::linsolv;
  int mic_data_solver_mode=Data::solver_mode;
  int mic_data_randomize=Data::randomize;
  double mic_data_nulow=Data::nulow;
  double mic_data_nuhigh=Data::nuhigh;
 #endif
   /* FIXME: uvmin is not needed in calibration, because its taken care of by flags */
    if (!Data::DoSim) {
    /****************** calibration **************************/
 <<<<<<< HEAD
 #ifndef HAVE_CUDA
    if (!doBeam) {
     precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
@ -685,15 +636,19 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
 #endif
    }
 =======
 #ifdef HAVE_CUDA
     precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
 #endif
 #ifndef HAVE_CUDA
     precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
 #endif
 >>>>>>> master
    /****************** end calibration **************************/
    /****************** begin diagnostics ************************/
 #ifdef HAVE_CUDA
    if (Data::DoDiag) {
     calculate_diagnostics(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,Data::DoDiag,Data::Nt);
    }
 #endif /* HAVE_CUDA */
    /****************** end diagnostics **************************/
   } else {
 <<<<<<< HEAD
    /************ simulation only mode ***************************/
    if (!solfile) {
 #ifndef HAVE_CUDA
@ -724,6 +679,23 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
    }
    /************ end simulation only mode ***************************/
   }
 =======
 #ifdef HAVE_CUDA
      predict_visibilities_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,(Data::DoSim>1?1:0));
 #endif
 #ifndef HAVE_CUDA
     precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
 #endif    
 }
 #ifdef HAVE_CUDA
    cudaDeviceSynchronize();
    cudaProfilerStop();
    exit(0);
 #endif
 >>>>>>> master
   tilex+=iodata.tilesz;
   /* print solutions to file */
@ -740,33 +712,11 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
   }
    /**********************************************************/
      /* also write back */
    if (iodata.Nms==1) {
     Data::writeData(msitr[0]->table(),iodata);
    } else {
     Data::writeDataList(msitr,iodata);
    }
    for(int cm=0; cm<iodata.Nms; cm++) {
      (*msitr[cm])++;
    }
   if (!Data::DoSim) {
   /* if residual has increased too much, or all are flagged (0 residual)
      or NaN
      reset solutions to original
      initial values */
   if (res_1==0.0 || !isfinite(res_1) || res_1>res_ratio*res_prev) {
     cout<<"Resetting Solution"<<endl;
     /* reset solutions so next iteration has default initial values */
     memcpy(p,pinit,(size_t)iodata.N*8*Mt*sizeof(double));
     /* also assume iterations have restarted from scratch */
     start_iter=1;
     /* also forget min residual (otherwise will try to reset it always) */
     res_prev=res_1;
   } else if (res_1<res_prev) { /* only store the min value */
    res_prev=res_1;
   }
   }
    end_time = time(0);
    elapsed_time = ((double) (end_time-start_time)) / 60.0;
    if (!Data::DoSim) {
    if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) { 
@ -790,10 +740,6 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
    Data::freeData(iodata,beam);
   }
 #ifdef USE_MIC
   free(mic_pindex);
   free(mic_chunks);
 #endif
    /**********************************************************/
  exinfo_gaussian *exg;
--- a/src/lib/Dirac/Makefile
+++ b/src/lib/Dirac/Makefile
@ -1,11 +1,13 @@
 CC=gcc
 CXX=g++
-#CFLAGS= -Wall -O3 -g #-pg
+CFLAGS= -Wall -g -pg
-CFLAGS= -Wall -O3 -fopt-info-optimized
+# Extra args for making callgraphs.
 # CFLAGS= -Wall -pg -O2 -ansi -fPIC -fpermissive -fno-omit-frame-pointer -DNDEBUG -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls
 # CFLAGS= -Wall -O3 -fopt-info-optimized
 CLIBS= -lm -lpthread
 #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
 #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
-LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
+LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread
 INCLUDES= -I. 
--- a/src/lib/Dirac/Makefile.gpu
+++ b/src/lib/Dirac/Makefile.gpu
@ -4,7 +4,6 @@ NVCC=nvcc
 CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE -pg
 CLIBS= -lm -lpthread
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 # LAPACK=-lblas -lgfortran -lpthread
 CUDAINC=/usr/local/cuda/include
 CUDALIB=-L/usr/local/cuda/lib64 -lcuda -lcudart
--- a/src/lib/Dirac/clmfit.c
+++ b/src/lib/Dirac/clmfit.c
@ -146,7 +146,7 @@ clevmar_der_single(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  err=cudaSetDevice(card);
  checkCudaError(err,__FILE__,__LINE__);
@ -199,7 +199,7 @@ clevmar_der_single(
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  int work_size=0;
  int *devInfo; 
  int devInfo_h=0;
@ -709,7 +709,7 @@ clevmar_der_single_cuda(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  unsigned long int moff; /* make sure offsets are multiples of 4 */
@ -742,7 +742,7 @@ clevmar_der_single_cuda(
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&ed, N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==1) {
    err=cudaMalloc((void**)&taud, M*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
@ -1378,7 +1378,7 @@ mlm_der_single_cuda(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  if (opts) {
@ -1435,7 +1435,7 @@ mlm_der_single_cuda(
  /* we need coherencies for only this cluster */
  err=cudaMalloc((void**) &cohd, Nbase*8*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==1) {
    /* QR solver ********************************/
    err=cudaMalloc((void**)&taud, M*sizeof(double));
--- a/src/lib/Dirac/clmfit_fl.c
+++ b/src/lib/Dirac/clmfit_fl.c
@ -143,7 +143,7 @@ oslevmar_der_single_cuda_fl(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  unsigned long int moff;
@ -704,7 +704,7 @@ clevmar_der_single_cuda_fl(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  unsigned long int moff;
--- a/src/lib/Dirac/clmfit_nocuda.c
+++ b/src/lib/Dirac/clmfit_nocuda.c
@ -188,7 +188,7 @@ clevmar_der_single_nocuda(
  WORK=Ud=Sd=VTd=0;
  me_data_t *dt=(me_data_t*)adata;
  setweights(M,aones,1.0,dt->Nt);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==0) {
  } else if (solve_axb==1) {
@ -739,7 +739,7 @@ mlm_der_single(
 #endif
      exit(1);
  }
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==1) {
    /* QR solver ********************************/
    /* workspace query */
@ -1221,7 +1221,7 @@ oslevmar_der_single_nocuda(
  me_data_t *dt=(me_data_t*)adata;
  setweights(M,aones,1.0,dt->Nt);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==0) {
  } else if (solve_axb==1) {
--- a/src/lib/Dirac/consensus_poly.c
+++ b/src/lib/Dirac/consensus_poly.c
@ -283,185 +283,6 @@ find_prod_inverse(double *B, double *Bi, int Npoly, int Nf, double *fratio) {
 typedef struct thread_data_prod_inv_ {
 int startM;
 int endM;
 int M;
 int Nf;
 int Npoly;
 double *B;
 double *Bi;
 double *rho;
 } thread_data_prod_inv_t;
 /* worker thread function for calculating sum and inverse */ 
 static void*
 sum_inv_threadfn(void *data) {
 thread_data_prod_inv_t *t=(thread_data_prod_inv_t*)data;
 double w[1],*WORK,*U,*S,*VT;
 int k,ci,status,lwork=0;
 int Np2=t->Npoly*t->Npoly;
 /* allocate memory for the SVD here */
  if ((U=(double*)calloc((size_t)Np2,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  if ((VT=(double*)calloc((size_t)Np2,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  if ((S=(double*)calloc((size_t)t->Npoly,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  /* memory for SVD: use first location of Bi */
  status=my_dgesvd('A','A',t->Npoly,t->Npoly,&(t->Bi[t->startM*Np2]),t->Npoly,S,U,t->Npoly,VT,t->Npoly,w,-1);
  if (!status) {
    lwork=(int)w[0];
  } else {
    printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
    exit(1);
  }
  if ((WORK=(double*)calloc((size_t)lwork,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
 /* iterate over clusters */
 for (k=t->startM; k<=t->endM; k++) {
   memset(&(t->Bi[k*Np2]),0,sizeof(double)*Np2);
   /* find sum */
   for (ci=0; ci<t->Nf; ci++) {
    /* outer product */
    my_dgemm('N','T',t->Npoly,t->Npoly,1,t->rho[k+ci*t->M],&t->B[ci*t->Npoly],t->Npoly,&t->B[ci*t->Npoly],t->Npoly,1.0,&(t->Bi[k*Np2]),t->Npoly);
   }
   /* find SVD */
   status=my_dgesvd('A','A',t->Npoly,t->Npoly,&(t->Bi[k*Np2]),t->Npoly,S,U,t->Npoly,VT,t->Npoly,WORK,lwork);
   if (status) {
    printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
    exit(1);
   }
   /* find 1/singular values, and multiply columns of U with new singular values */
   for (ci=0; ci<t->Npoly; ci++) {
    if (S[ci]>CLM_EPSILON) {
     S[ci]=1.0/S[ci];
    } else {
     S[ci]=0.0;
    }
    my_dscal(t->Npoly,S[ci],&U[ci*t->Npoly]);
   }
   /* find product U 1/S V^T */
   my_dgemm('N','N',t->Npoly,t->Npoly,t->Npoly,1.0,U,t->Npoly,VT,t->Npoly,0.0,&(t->Bi[k*Np2]),t->Npoly);
 }
 free(U);
 free(VT);
 free(S);
 free(WORK);
 return NULL;
 }
 /* build matrix with polynomial terms
  B : Npoly x Nf, each row is one basis function
  Bi: Npoly x Npoly pseudo inverse of sum( B(:,col) x B(:,col)' ) : M times
  Npoly : total basis functions
  Nf: frequencies
  M: clusters
  rho: NfxM array of regularization factors (for each freq, M values)
  Sum taken is a weighted sum, using weights in rho, rho is assumed to change for each freq,cluster pair 
  Nt: no. of threads
 */
 int
 find_prod_inverse_full(double *B, double *Bi, int Npoly, int Nf, int M, double *rho, int Nt) {
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_prod_inv_t *threaddata;
  int ci,Nthb0,Nthb,nth,nth1;
  /* clusters per thread */
  Nthb0=(M+Nt-1)/Nt;
  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((threaddata=(thread_data_prod_inv_t*)malloc((size_t)Nt*sizeof(thread_data_prod_inv_t)))==0) {
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  ci=0;
  for (nth=0;  nth<Nt && ci<M; nth++) {
    if (ci+Nthb0<M) { 
     Nthb=Nthb0;
    } else {
     Nthb=M-ci;
    }
    threaddata[nth].B=B;
    threaddata[nth].Bi=Bi;
    threaddata[nth].rho=rho;
    threaddata[nth].Npoly=Npoly;
    threaddata[nth].Nf=Nf;
    threaddata[nth].M=M;
    threaddata[nth].startM=ci;
    threaddata[nth].endM=ci+Nthb-1;
    pthread_create(&th_array[nth],&attr,sum_inv_threadfn,(void*)(&threaddata[nth]));
    ci=ci+Nthb;
  }
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
 #ifdef DEBUG
  int k,cj;
  for (k=0; k<M; k++) {
    printf("dir_%d=",k);
    for (cj=0; cj<Nf; cj++) {
      printf("%lf ",rho[k+cj*M]);
    }
    printf("\n");
  }
  for (k=0; k<M; k++) {
  printf("Bii_%d=[\n",k);
  for (ci=0; ci<Npoly; ci++) {
   for(cj=0; cj<Npoly; cj++) {
    printf("%lf ",Bi[k*Npoly*Npoly+ci*Npoly+cj]);
   }
   printf("\n");
  }
  printf("];\n");
  }
 #endif
  return 0;
 }
 /* update Z
   Z: 8N Npoly x M double array (real and complex need to be updated separate)
   N : stations
@ -526,323 +347,3 @@ update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi) {
 free(Q);
 return 0;
 }
 typedef struct thread_data_update_z_ {
 int startM;
 int endM;
 int N;
 int M;
 int Npoly;
 double *Z;
 double *z;
 double *Bi;
 } thread_data_update_z_t;
 /* worker thread function for updating z */
 static void*
 update_z_threadfn(void *data) {
  thread_data_update_z_t *t=(thread_data_update_z_t*)data;
 /* one block of Z for one direction 2Nx2xNpoly (complex)
    and 8NxNpoly  real values : select one column : 2NxNpoly (complex)
    select real,imag : 2NxNpoly each (vector)
    reshape each to 2NxNpoly matrix => Q
    Bi : NpolyxNpoly matrix = B^T
    for each direction (M values)
    select 2N,2N,... : 2Nx Npoly complex values from z (ordered by M)
    select real,imag: size 2NxNpoly, 2NxNpoly vectors
    reshape to 2NxNpoly => R
    reshape to 2NxNpoly => I (imag)
    then Q=([R I] Bi^T) for each column
    Q=[R_1^T I_1^T R_2^T I_2^T]^T Bi^T for 2 columns
    R_1,I_1,R_2,I_2 : size 2NxNpoly 
    R : (2N 4) x Npoly
    so find Q
 */
 double *R,*Q;
 if ((R=(double*)calloc((size_t)2*t->N*t->Npoly*4,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
 }
 if ((Q=(double*)calloc((size_t)2*t->N*t->Npoly*4,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
 }
 int ci,np;
 for (ci=t->startM; ci<=t->endM; ci++) {
  for (np=0; np<t->Npoly; np++) {
   /* select 2N */
   my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M], 4, &R[np*8*t->N], 1); /* R_1 */
   my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+1], 4, &R[np*8*t->N+2*t->N], 1); /* I_1 */
   my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+2], 4, &R[np*8*t->N+2*2*t->N], 1); /* R_2 */
   my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+3], 4, &R[np*8*t->N+3*2*t->N], 1); /* I_2 */
  }
  /* find Q=R B^T */
  memset(Q,0,sizeof(double)*2*t->N*t->Npoly*4);
  my_dgemm('N','N',8*t->N,t->Npoly,t->Npoly,1.0,R,8*t->N,&t->Bi[ci*t->Npoly*t->Npoly],t->Npoly,1.0,Q,8*t->N);
  /* copy back to Z */ 
  for (np=0; np<t->Npoly; np++) {
   my_dcopy(2*t->N, &Q[np*8*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np], 4); 
   my_dcopy(2*t->N, &Q[np*8*t->N+2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+1], 4); 
   my_dcopy(2*t->N, &Q[np*8*t->N+2*2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+2], 4); 
   my_dcopy(2*t->N, &Q[np*8*t->N+3*2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+3], 4); 
  }
 }
 free(R);
 free(Q);
 return NULL;
 }
 /* update Z
   Z: 8N Npoly x M double array (real and complex need to be updated separate)
   N : stations
   M : clusters
   Npoly: no of basis functions
   z : right hand side 8NM Npoly x 1 (note the different ordering from Z)
   Bi : M values of NpolyxNpoly matrices, Bi^T=Bi assumed
   Nt: no. of threads
 */
 int 
 update_global_z_multi(double *Z,int N,int M,int Npoly,double *z,double *Bi, int Nt) {
   pthread_attr_t attr;
   pthread_t *th_array;
   thread_data_update_z_t *threaddata;
  int ci,Nthb0,Nthb,nth,nth1;
  /* clusters per thread */
  Nthb0=(M+Nt-1)/Nt;
  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((threaddata=(thread_data_update_z_t*)malloc((size_t)Nt*sizeof(thread_data_update_z_t)))==0) {
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  ci=0;
  for (nth=0;  nth<Nt && ci<M; nth++) {
    if (ci+Nthb0<M) {
     Nthb=Nthb0;
    } else {
     Nthb=M-ci;
    }
    threaddata[nth].z=z;
    threaddata[nth].Z=Z;
    threaddata[nth].Bi=Bi;
    threaddata[nth].N=N;
    threaddata[nth].M=M;
    threaddata[nth].Npoly=Npoly;
    threaddata[nth].startM=ci;
    threaddata[nth].endM=ci+Nthb-1;
    pthread_create(&th_array[nth],&attr,update_z_threadfn,(void*)(&threaddata[nth]));
    ci=ci+Nthb;
  }
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
 return 0;
 }
 /* generate a random integer in the range 0,1,...,maxval */
 int
 random_int(int maxval) {
  double rat=(double)random()/(double)RAND_MAX;
  double y=rat*(double)(maxval+1);
  int x=(int)floor(y);
  return x;
 }
 typedef struct thread_data_rho_bb_ {
 int startM;
 int endM;
 int offset;
 int M;
 int N;
 double *rho;
 double *rhoupper;
 double *deltaY;
 double *deltaJ;
 clus_source_t *carr;
 } thread_data_rho_bb_t;
 /* worker thread function for calculating sum and inverse */
 static void*
 rho_bb_threadfn(void *data) {
 thread_data_rho_bb_t *t=(thread_data_rho_bb_t*)data;
 double alphacorrmin=0.2;
 int ci,ck;
 double ip11,ip12,ip22;
 ck=t->offset;
 for (ci=t->startM; ci<=t->endM; ci++) {
   ip12=my_ddot(8*t->N*t->carr[ci].nchunk,&t->deltaY[ck],&t->deltaJ[ck]); /* x^T y */
   /* further computations are only required if there is +ve correlation */
   if (ip12>CLM_EPSILON) {
   /* find the inner products */
   ip11=my_dnrm2(8*t->N*t->carr[ci].nchunk,&t->deltaY[ck]); /* || ||_2 */
   ip22=my_dnrm2(8*t->N*t->carr[ci].nchunk,&t->deltaJ[ck]); /* || ||_2 */
   /* square the norm to get dot prod */
   ip11*=ip11;
   ip22*=ip22;
   /* only try to do an update if the 'delta's are finite, also 
     there is tangible correlation between the two deltas */
 #ifdef DEBUG
   printf("%d ip11=%lf ip12=%lf ip22=%lf\n",ci,ip11,ip12,ip22);
 #endif
   if (ip11>CLM_EPSILON && ip22>CLM_EPSILON) {
     double alphacorr=ip12/sqrt(ip11*ip22);
     /* decide on whether to do further calculations only if there is sufficient correlation 
        between the deltas */
     if (alphacorr>alphacorrmin) {
     double alphaSD=ip11/ip12;
     double alphaMG=ip12/ip22;
     double alphahat;
     if (2.0*alphaMG>alphaSD) {
      alphahat=alphaMG;
     } else {
      alphahat=alphaSD-alphaMG*0.5;
     }
 #ifdef DEBUG
     printf("alphacorr=%lf alphaSD=%lf alphaMG=%lf alphahat=%lf rho=%lf\n",alphacorr,alphaSD,alphaMG,alphahat,t->rho[ci]);
 #endif
      /* decide on whether to update rho based on heuristics */
      if (alphahat> 0.001 && alphahat<t->rhoupper[ci]) {
 #ifdef DEBUG
       printf("updating rho from %lf to %lf\n",t->rho[ci],alphahat);
 #endif
       t->rho[ci]=alphahat;
      }
     }
   }
   } 
   ck+=t->N*8*t->carr[ci].nchunk;
 }
 return NULL;
 }
 /* Barzilai & Borwein update of rho [Xu et al] */
 /* rho: Mx1 values, to be updated
   rhoupper: Mx1 values, upper limit of rho
   N: no of stations
   M : clusters
   Mt: actual clusters (include hybrid parameter) Mt >= M
   carr: cluster info array, to get hybrid parameters: Mx1
   Yhat: current Yhat : 8*N*Mt 
   Yhat_k0 : old Yhat at previous update of rho : 8*N*Mt
   J: current solution : 8*N*Mt
   J_k0: old solution at previous update of rho : 8*N*Mt
   Nt: no. of threads
 */ 
 int
 update_rho_bb(double *rho, double *rhoupper, int N, int M, int Mt, clus_source_t *carr, double *Yhat, double *Yhat_k0, double *J, double *J_k0, int Nt) {
 double *deltaY; /* Yhat - Yhat_k0 */
 double *deltaJ; /* J - J_k0 (with J_k0 projected to tangent plane of J)*/
 if ((deltaY=(double*)calloc((size_t)8*N*Mt,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
 }
 if ((deltaJ=(double*)calloc((size_t)8*N*Mt,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
 }
 my_dcopy(8*N*Mt, Yhat, 1, deltaY, 1); 
 my_daxpy(8*N*Mt, Yhat_k0, -1.0, deltaY);
 //no need to remove unitary ambiguity from J-Jold
 my_dcopy(8*N*Mt, J, 1, deltaJ, 1); 
 my_daxpy(8*N*Mt, J_k0,-1.0, deltaJ);
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_rho_bb_t *threaddata;
  int ci,cj,ck,Nthb0,Nthb,nth,nth1;
  /* clusters per thread */
  Nthb0=(M+Nt-1)/Nt;
  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((threaddata=(thread_data_rho_bb_t*)malloc((size_t)Nt*sizeof(thread_data_rho_bb_t)))==0) {
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  ci=0;
  ck=0;
  for (nth=0;  nth<Nt && ci<M; nth++) {
    if (ci+Nthb0<M) {
     Nthb=Nthb0;
    } else {
     Nthb=M-ci;
    }
    threaddata[nth].N=N;
    threaddata[nth].M=M;
    threaddata[nth].offset=ck;
    threaddata[nth].startM=ci;
    threaddata[nth].endM=ci+Nthb-1;
    threaddata[nth].rho=rho;
    threaddata[nth].rhoupper=rhoupper;
    threaddata[nth].deltaY=deltaY;
    threaddata[nth].deltaJ=deltaJ;
    threaddata[nth].carr=carr;
    /* find the right offset too, since ci is not always incremented by 1 need to add up */
    for (cj=ci; cj<ci+Nthb && cj<M; cj++) {
      ck+=N*8*carr[cj].nchunk;
    }
    pthread_create(&th_array[nth],&attr,rho_bb_threadfn,(void*)(&threaddata[nth]));
    ci=ci+Nthb;
  }
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
 free(deltaY);
 free(deltaJ);
 return 0;
 }
--- a/src/lib/Dirac/dataio.c
+++ b/src/lib/Dirac/dataio.c
@ -1,82 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 #include "Dirac.h"
 int 
 open_data_stream(int file, double **d, int *count, int *N, double *freq0, double *ra0, double *dec0) {
  struct stat statbuf;
  int ig;
 /* find the file size */
 if (fstat (file,&statbuf) < 0) {
   fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__);
   exit(1);
 }
 //printf("file size (bytes) %d\n",(int)statbuf.st_size);
 /* total double values is size/8 */
 *count=statbuf.st_size/8;
 //printf("total double values %d\n",*count);
  /* map the file to memory */
  *d= (double*)mmap(NULL,  statbuf.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, file, 0);
  if ( !d) {
     fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__);
 		 exit(1);
  }
  /* remove header from data */
  *N=(int)(*d)[0];
  *freq0=(*d)[1];
  *ra0=(*d)[2];
  *dec0=(*d)[3];
  /* read ignored stations and discard them */
  ig=(int)(*d)[4]; 
  /* make correct value for N */
  *N=*N-ig;
  printf("Ignoring %d stations\n",ig);
  /* increment to data */
  *d=&((*d)[5+ig]); 
  return(0);
 }
 int
 close_data_stream(double *d, int count) {
  /* sync to disk */
  msync(d, (size_t)count*sizeof(double), MS_SYNC );
  munmap((void*)d, (size_t)count*sizeof(double));
  return 0;
 }
--- a/src/lib/Dirac/lbfgs.c
+++ b/src/lib/Dirac/lbfgs.c
@ -102,7 +102,7 @@ pipeline_slave_code_b(void *data)
    checkCudaError(err,__FILE__,__LINE__);
  } else if (dp->status[tid]==PT_DO_CCOST) {
   /* divide total baselines by 2 */
-   int BlocksPerGrid=(dp->Nbase[tid]+dp->lmdata[tid]->ThreadsPerBlock-1)/dp->lmdata[tid]->ThreadsPerBlock;
+   int BlocksPerGrid= 2*(dp->Nbase[tid]+dp->lmdata[tid]->ThreadsPerBlock-1)/dp->lmdata[tid]->ThreadsPerBlock;
   int  boff=dp->boff[tid];
   /* copy the current solution to device */
   err=cudaMemcpy(dp->cpp[tid], dp->lmdata[tid]->p, m*sizeof(double), cudaMemcpyHostToDevice);
@ -901,12 +901,12 @@ lbfgs_fit_common(
  /* parameters per thread (GPU) */
  int Nparm=(m+2-1)/2;
  /* find number of blocks */
-  int BlocksPerGrid = (Nparm+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid = 2* (Nparm+ThreadsPerBlock-1)/ThreadsPerBlock;
  ci=0;
  int nth;
  for (nth=0; nth<2; nth++) {
   threaddata[nth].ThreadsPerBlock=ThreadsPerBlock;
-   threaddata[nth].BlocksPerGrid=BlocksPerGrid;
+   threaddata[nth].BlocksPerGrid= 2*BlocksPerGrid;
   threaddata[nth].card=nth;
   threaddata[nth].Nbase=dp->Nbase;
   threaddata[nth].tilesz=dp->tilesz;
--- a/src/lib/Dirac/myblas.c
+++ b/src/lib/Dirac/myblas.c
@ -206,7 +206,7 @@ __attribute__ ((target(MIC)))
 }
-/* following routines used in LAPACK solvers */
+/* following routines used in LAPACK dirac */
 /* cholesky factorization: real symmetric */
 int
 my_dpotrf(char uplo, int N, double *A, int lda) {
--- a/src/lib/Dirac/oslmfit.c
+++ b/src/lib/Dirac/oslmfit.c
@ -157,7 +157,7 @@ oslevmar_der_single_cuda(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  unsigned long int moff;
@ -190,7 +190,7 @@ oslevmar_der_single_cuda(
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&ed, N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==1) {
    err=cudaMalloc((void**)&taud, M*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
--- a/src/lib/Dirac/robustlm.c
+++ b/src/lib/Dirac/robustlm.c
@ -159,7 +159,7 @@ rlevmar_der_single_cuda(
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* DEFAULT_TH_PER_BK/8 for accessing each element of a baseline */
  int ThreadsPerBlock2=Nd/2; /* for evaluating nu */
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  unsigned long int moff;
@ -196,7 +196,7 @@ rlevmar_der_single_cuda(
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&ed, N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==1) {
    err=cudaMalloc((void**)&taud, M*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
@ -805,7 +805,7 @@ rlevmar_der_single_cuda_fl(
  /* FIXME: might need a large value for large no of baselines */
  int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* for accessing each element of a baseline */
  int ThreadsPerBlock2=Nd/2; /* for evaluating nu */
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  unsigned long int moff;
@ -1382,7 +1382,7 @@ osrlevmar_der_single_cuda_fl(
  /* FIXME: might need a large value for large no of baselines */
  int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* for accessing each element of a baseline */
  int ThreadsPerBlock2=Nd/2; /* for evaluating nu */
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  unsigned long int moff;
@ -2180,7 +2180,7 @@ rlevmar_der_single_nocuda(
  setweights(M,aones,1.0,lmdata->Nt);
  /*W set initial weights to 1 */
  setweights(N,wtd,1.0,lmdata->Nt);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==0) {
  } else if (solve_axb==1) {
@ -2766,7 +2766,7 @@ osrlevmar_der_single_nocuda(
  /*W set initial weights to 1 */
  setweights(N,wtd,1.0,lmdata0->Nt);
-  /* memory allocation: different solvers */
+  /* memory allocation: different dirac */
  if (solve_axb==0) {
  } else if (solve_axb==1) {
--- a/src/lib/Dirac/rtr_solve_cuda.c
+++ b/src/lib/Dirac/rtr_solve_cuda.c
@ -582,7 +582,7 @@ rtr_solve_cuda_fl(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  /* reshape x to make J: 2Nx2 complex double 
--- a/src/lib/Dirac/rtr_solve_robust_cuda.c
+++ b/src/lib/Dirac/rtr_solve_robust_cuda.c
@ -602,7 +602,7 @@ rtr_solve_cuda_robust_fl(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  /* reshape x to make J: 2Nx2 complex double 
@ -985,7 +985,7 @@ nsd_solve_cuda_robust_fl(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  /* reshape x to make J: 2Nx2 complex double 
--- a/src/lib/Dirac/rtr_solve_robust_cuda_admm.c
+++ b/src/lib/Dirac/rtr_solve_robust_cuda_admm.c
@ -517,7 +517,7 @@ rtr_solve_cuda_robust_admm_fl(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  /* reshape x to make J: 2Nx2 complex double 
@ -947,7 +947,7 @@ nsd_solve_cuda_robust_admm_fl(
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
-  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  /* reshape x to make J: 2Nx2 complex double 
--- a/src/lib/Radio/Makefile
+++ b/src/lib/Radio/Makefile
@ -2,11 +2,12 @@ CC=gcc
 CXX=g++
 #CFLAGS= -Wall -O3 -g #-pg
 CFLAGS= -Wall -O3 -fopt-info-optimized
 # CFLAGS= -Wall -pg -O2 -ansi -fPIC -fpermissive -fno-omit-frame-pointer -DNDEBUG -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls
 CLIBS= -lm -lpthread
 #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
 #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
-LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
+#LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
-
+LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread
 INCLUDES= -I. -I../Dirac/ 
 LIBPATH=
--- a/src/lib/Radio/Makefile.gpu
+++ b/src/lib/Radio/Makefile.gpu
@ -3,7 +3,8 @@ CXX=g++
 NVCC=nvcc
 CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE -pg
 CLIBS= -lm -lpthread
-LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
+#LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread
 # LAPACK=-lblas -lgfortran -lpthread
 CUDAINC=/usr/local/cuda/include
--- a/src/lib/Radio/predict_model.cu
+++ b/src/lib/Radio/predict_model.cu
--- a/src/lib/Radio/predict_withbeam.c
+++ b/src/lib/Radio/predict_withbeam.c
@ -1114,7 +1114,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
    for (cj=0; cj<carr[cm].nchunk; cj++) {
      pm=&(p[carr[cm].p[cj]]); /* start of solutions */
      /* extract phase of pm, output to pphase */
-      extract_phases(pm,pphase,N,10);
+      // extract_phases(pm,pphase,N,10);
      /* invert N solutions */
      for (ci=0; ci<N; ci++) {
       mat_invert(&pphase[8*ci],&pinv[8*ci+8*N*cj], rho);
--- a/src/lib/Radio/predict_withbeam_gpu.c
+++ b/src/lib/Radio/predict_withbeam_gpu.c
--- a/test/Generate_sources.py
+++ b/test/Generate_sources.py
@ -28,6 +28,9 @@ tol_seconds_of_RA = tol_seconds_of_decl * 24/360
 RA_hours_3C196 = 0
 RA_minutes_3C196 = 0
 RA_seconds_3C196 = 0
 #RA_hours_3C196 = 8
 #RA_minutes_3C196 = 13
 #RA_seconds_3C196 = 35.981540
 RA_seconds_3C196 = (RA_hours_3C196 * 60 + RA_minutes_3C196) *60 + RA_seconds_3C196
@ -42,6 +45,9 @@ RA_hours_high, RA_minutes_high = divmod(RA_minutes_high, 60)
 decl_degrees_3C196 = 90
 decl_minutes_3C196 = 0
 decl_seconds_3C196 = 0
 #decl_degrees_3C196 = 48
 #decl_minutes_3C196 = 12
 #decl_seconds_3C196 = 59.174770
 decl_seconds_3C196 = (decl_degrees_3C196 * 60 + decl_minutes_3C196) *60 + decl_seconds_3C196
@ -68,9 +74,8 @@ with warnings.catch_warnings():
             'pos_angle', 'freq0')
    formats = ['U6', 'i4', 'i4', 'f8', 'i4', 'i4', 'f8', 'f8', 'i4', 'i4', 'i4', 'f8', 'f8',
-               'i4', 'i4', 'i4', 'i4', 'f8', 'f8']
+      'f8', 'f8', 'f8', 'f8', 'f8', 'f8']
-
+    formats_reformatted = '%s  %d  %d  %f  %d  %d  %f  %f  %d  %d  %d  %f  %f  %f  %f  %f  %f  %f  %f'
    formats_reformatted = '%s  %d  %d  %f  %d  %d  %f  %f  %d  %d  %d  %f  %f  %d  %d  %d  %d  %f  %f'
    sources_parameters = np.recarray((number_of_sources,), formats=formats,
                                     names=names)
--- a/test/dosage.sh
+++ b/test/dosage.sh
@ -1,2 +1,2 @@
 # Before running this, untar sm.ms.tar and build sagecal
-../src/MS/sagecal -d sm.ms -s 3c196.sky.txt -c 3c196.sky.txt.cluster -n 4 -t 10 -p sm.ms.solutions -e 4 -g 2 -l 10 -m 7 -x 30 -F 1 -j 5  -k -1 -B 1 -W 0 > sm.ms.output
+../src/MS/sagecal -d sm.ms -s extended_source_list.txt -c extended_source_list.txt.cluster -n 4 -t 10 -p sm.ms.solutions -e 4 -g 2 -l 10 -m 7 -x 30 -F 1 -j 2  -k -1 -B 1 -W 0 > sm.ms.output
`@ -1,2 +1,2 @@`
	`# Before running this, untar sm.ms.tar and build sagecal`	`# Before running this, untar sm.ms.tar and build sagecal`
	`../src/MS/sagecal -d sm.ms -s 3c196.sky.txt -c 3c196.sky.txt.cluster -n 4 -t 10 -p sm.ms.solutions -e 4 -g 2 -l 10 -m 7 -x 30 -F 1 -j 5 -k -1 -B 1 -W 0 > sm.ms.output`	`../src/MS/sagecal -d sm.ms -s extended_source_list.txt -c extended_source_list.txt.cluster -n 4 -t 10 -p sm.ms.solutions -e 4 -g 2 -l 10 -m 7 -x 30 -F 1 -j 2 -k -1 -B 1 -W 0 > sm.ms.output`