Major cleanup - in particular of files that should not be tracked, i.e. should not be included in the repo.

2017-06-21 16:19:52 +02:00 · 2017-06-21 16:19:52 +02:00 · a7108dc025
parent 1e7d1883a3
commit a7108dc025
129 changed files with 57 additions and 86843 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,25 @@
 */*.png
 */*.out
 */*.output
 */*.solutions
 */*.dot
 test/sm.ms/
 test/nvprof-resultaten/
 src/MS/sagecal
 test/analysis-*.txt
 test/extended_*.*
 */*/*.a
 */*/*.o
 */*/*.swp
 */*/*.swo
 */*/*.out
 */*/*.output
 */*/*/*.o
 */*/*/*.a
 */*/*/*.swp
 */*/*/*.swo
 */*/*/*.out
 */*/*/*.output
--- a/src/MS/Makefile
+++ b/src/MS/Makefile
@ -32,4 +32,5 @@ data.o:data.cpp data.h
 sagecal:$(OBJECTS) ../lib/Radio/libsagecal.a ../lib/Dirac/libdirac.a 
 	$(CXX) $(CXXFLAGS) $(LDFLAGS) $(INCLUDES) $(GLIBI) $(LIBPATH)  -o $@  $(OBJECTS) $(MY_LIBS) $(LAPACK) $(CASA_LIBS)  $(GLIBL)
 clean:
-	rm *.o *.tmp *.fits
+	rm *.o *.tmp *.fits *.swp *.swo *.o *.output
--- a/src/MS/Makefile.gpu
+++ b/src/MS/Makefile.gpu
@ -1,11 +1,10 @@
 OUTPUT=
 CXX=g++
 CXXFLAGS=-O3 -Wall -g -DHAVE_CUDA
 # CXXFLAGS=-O3 -Wall -g -DHAVE_CUDA -DONE_GPU
 CASA_LIBDIR=-L/cm/shared/package/casacore/v2.1.0-gcc-4.9.3/lib -L/cm/shared/package/cfitsio/3380-gcc-4.9.3/lib -L/cm/shared/package/lapack/3.6.0-gcc-4.9.3/lib64
 CASA_INCDIR=-I/cm/shared/package/casacore/v2.1.0-g++-4.9.3/include -I/cm/shared/package/casacore/v2.1.0-g++-4.9.3/include/casacore
 CASA_LIBS=-lcasa_casa -lcasa_tables -lcasa_measures -lcasa_ms -lcfitsio
-#LAPACK=-llapack -lblas
+# LAPACK=-llapack -lblas
 LAPACK=-lopenblas -lgfortran -lpthread
 LAPACK_DIR=/cm/shared/apps/openblas/0.2.8/lib
 #LAPACK_DIR=/usr/lib/atlas/sse/
--- a/src/MS/lib
+++ b/src/MS/lib
@ -1 +0,0 @@
 ../lib
--- a/src/MS/main.cpp
+++ b/src/MS/main.cpp
@ -236,11 +236,7 @@ main(int argc, char **argv) {
     Data::readMSlist(Data::MSlist,&msnames);
    }
    if (Data::TableName) {
     if (!doBeam) {
      Data::readAuxData(Data::TableName,&iodata);
     } else {
      Data::readAuxData(Data::TableName,&iodata,&beam);
     }
     cout<<"Only one MS"<<endl;
    } else if (Data::MSlist) {
     Data::readAuxDataList(msnames,&iodata);
@ -251,7 +247,8 @@ main(int argc, char **argv) {
     srand(time(0)); /* use different seed */
    }
-    openblas_set_num_threads(1);//Data::Nt;
+    // openblas_set_num_threads(1);//Data::Nt;
    // export OMP_NUM_THREADS=1
    /**********************************************************/
     int M,Mt,ci,cj,ck;  
   /* parameters */
@ -420,7 +417,7 @@ main(int argc, char **argv) {
    /* starting iterations are doubled */
    // int start_iter=1;
-    int sources_precessed=0;
+    // int sources_precessed=0;
    double inv_c=1.0/CONST_C;
@ -430,11 +427,7 @@ main(int argc, char **argv) {
    while (msitr[0]->more()) {
      start_time = time(0);
      if (iodata.Nms==1) {
       if (!doBeam) {
        Data::loadData(msitr[0]->table(),iodata,&iodata.fratio);
       } else {
        Data::loadData(msitr[0]->table(),iodata,beam,&iodata.fratio);
       }
      } else { 
       Data::loadDataList(msitr,iodata,&iodata.fratio);
      }
@ -449,15 +442,9 @@ main(int argc, char **argv) {
    preset_flags_and_data(iodata.Nbase*iodata.tilesz,iodata.flag,barr,iodata.x,Data::Nt);
    /* if data is being whitened, whiten x here,
     no need for a copy because we use xo for residual calculation */
    if (Data::whiten) {
     whiten_data(iodata.Nbase*iodata.tilesz,iodata.x,iodata.u,iodata.v,iodata.freq0,Data::Nt);
    }
    /* precess source locations (also beam pointing) from J2000 to JAPP if we do any beam predictions,
      using first time slot as epoch */
-    if (doBeam && !sources_precessed) {
+    // sources_precessed=1;
      precess_source_locations(beam.time_utc[iodata.tilesz/2],carr,M,&beam.p_ra0,&beam.p_dec0,Data::Nt);
      sources_precessed=1;
    }
@ -467,6 +454,9 @@ main(int argc, char **argv) {
 #ifdef HAVE_CUDA
     precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
 #endif
 #ifndef HAVE_CUDA
     precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
 #endif
    /****************** end calibration **************************/
    /****************** begin diagnostics ************************/
@ -475,7 +465,17 @@ main(int argc, char **argv) {
      predict_visibilities_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,(Data::DoSim>1?1:0));
 #endif
-    }
+#ifndef HAVE_CUDA
     precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
  beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
 #endif    
 }
 #ifdef HAVE_CUDA
    cudaDeviceSynchronize();
    cudaProfilerStop();
    exit(0);
 #endif
   tilex+=iodata.tilesz;
   /* print solutions to file */
@ -495,10 +495,6 @@ main(int argc, char **argv) {
    for(int cm=0; cm<iodata.Nms; cm++) {
      (*msitr[cm])++;
    }
 #ifdef HAVE_CUDA
    cudaDeviceSynchronize();
    cudaProfilerStop();
 #endif
    end_time = time(0);
    elapsed_time = ((double) (end_time-start_time)) / 60.0;
--- a/src/lib/Dirac/.gitignore
+++ b/src/lib/Dirac/.gitignore
@ -1,2 +0,0 @@
 *.swp
 *.swo
--- a/src/lib/Dirac/Makefile.gpu
+++ b/src/lib/Dirac/Makefile.gpu
@ -4,12 +4,13 @@ NVCC=nvcc
 CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE
 CLIBS= -lm -lpthread
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 # LAPACK=-lblas -lgfortran -lpthread
 CUDAINC=/usr/local/cuda/include
 CUDALIB=-L/usr/local/cuda/lib64 -lcuda -lcudart
 #NVCC=/usr/local/cuda/bin/nvcc
 #NVCFLAGS=-arch=sm_35 -g -G --ptxas-options=-v -O3
-NVCFLAGS=-arch=sm_35 --ptxas-options=-v -O3
+NVCFLAGS=-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=compute_61 -gencode arch=compute_62,code=compute_62 --ptxas-options=-v -O3
 #### glib
 GLIBI=-I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include/
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@ -1,66 +0,0 @@
 CC=gcc
 CXX=g++
 #CFLAGS= -Wall -O3 -g #-pg
 CFLAGS= -Wall -O3 -fopt-info-optimized
 CLIBS= -lm -lpthread
 #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
 #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 INCLUDES= -I. 
 LIBPATH=
 #### glib
 GLIBI=-I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -I/usr/lib/x86_64-linux-gnu/glib-2.0/include/ -I/usr/lib64/glib-2.0/include
 GLIBL=-lglib-2.0
 OBJECTS=readsky.o dataio.o predict.o lmfit_nocuda.o clmfit_nocuda.o lbfgs_nocuda.o myblas.o residual.o robustlm.o updatenu.o robust_lbfgs_nocuda.o rtr_solve.o  rtr_solve_robust.o manifold_average.o consensus_poly.o rtr_solve_robust_admm.o admm_solve.o transforms.o stationbeam.o predict_withbeam.o
 default:libsagecal.a
 readsky.o:readsky.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 dataio.o:dataio.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 predict.o:predict.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 lmfit_nocuda.o:lmfit_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 clmfit_nocuda.o:clmfit_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 lbfgs_nocuda.o:lbfgs_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 myblas.o:myblas.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 residual.o:residual.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robustlm.o:robustlm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 updatenu.o:updatenu.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robust_lbfgs_nocuda.o:robust_lbfgs_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 rtr_solve.o:rtr_solve.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 rtr_solve_robust.o:rtr_solve_robust.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 manifold_average.o:manifold_average.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 consensus_poly.o:consensus_poly.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 rtr_solve_robust_admm.o:rtr_solve_robust_admm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 admm_solve.o:admm_solve.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 transforms.o:transforms.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 stationbeam.o:stationbeam.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 predict_withbeam.o:predict_withbeam.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 RANLIB=ranlib
 libsagecal.a:$(OBJECTS) sagecal.h
 	ar rv $@ $(OBJECTS); \
 	$(RANLIB) $@;
--- a/src/lib/Makefile.MIC
+++ b/src/lib/Makefile.MIC
@ -1,66 +0,0 @@
 CC=icc
 CXX=icpc
 LD=icc
 # MKL 
 MKLROOT=/opt/intel/composer_xe_2013.5.192/mkl
 IFACE_LIB=mkl_intel_lp64
 THREADING_LIB=mkl_intel_thread
 CORE_LIB=mkl_core
 LDFLAGS=-L$(MKLROOT)/lib/intel64 -l$(IFACE_LIB) -l$(THREADING_LIB) -l$(CORE_LIB) -lpthread -lm
 MIC_LDFLAGS=-L$(MKLROOT)/lib/mic -l$(IFACE_LIB) -l$(THREADING_LIB) -l$(CORE_LIB)
 ##CFLAGS +=-DUSE_MIC -Wall -DDEBUG -g -O0 -openmp -vec-report=1
 #CFLAGS +=-DUSE_MIC -Wall -O1 -profile-functions -profile-loops=all -profile-loops-report=2 -openmp
 CFLAGS +=-DUSE_MIC -Wall -O3 -openmp -vec-report=1
 #MICFLAGS =-offload-option,mic,compiler,"-DUSE_MIC -vec-report1 -g -O0 -Wall"
 MICFLAGS =-offload-option,mic,compiler,"-DUSE_MIC -vec-report1 -O3 -openmp -Wall"
 #MICFLAGS =-offload-option,mic,compiler,"-DUSE_MIC -O1 -profile-functions -profile-loops=all -profile-loops-report=2 -openmp"
 MICLDFLAGS=-offload-option,mic,ld,"$(MIC_LDFLAGS)"
 #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
 #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 INCLUDES= -I. 
 LIBPATH=
 #### glib
 GLIBI=-I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -I/usr/lib64/glib-2.0/include/
 GLIBL=-lglib-2.0
 OBJECTS=readsky.o dataio.o predict.o lmfit_nocuda.o clmfit_nocuda.o lbfgs_nocuda.o myblas.o residual.o robustlm.o updatenu.o robust_lbfgs_nocuda.o
 #  clmfit_nocudaMIC.o lmfit_nocudaMIC.o robust_lbfgs_nocudaMIC.o updatenuMIC.o\
  lbfgs_nocudaMIC.o myblasMIC.o robustlmMIC.o
 default:libsagecal.a
 readsky.o:readsky.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 dataio.o:dataio.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 predict.o:predict.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 lmfit_nocuda.o:lmfit_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 clmfit_nocuda.o:clmfit_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 lbfgs_nocuda.o:lbfgs_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 myblas.o:myblas.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 residual.o:residual.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robustlm.o:robustlm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 updatenu.o:updatenu.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robust_lbfgs_nocuda.o:robust_lbfgs_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 RANLIB=ranlib
 libsagecal.a:$(OBJECTS) sagecal.h
 	xiar -qoffload-build rv $@ $(OBJECTS); \
 	$(RANLIB) $@;
--- a/src/lib/Makefile.gpu
+++ b/src/lib/Makefile.gpu
@ -1,108 +0,0 @@
 CC=gcc
 CXX=g++
 NVCC=nvcc
 CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE
 CLIBS= -lm -lpthread
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 CUDAINC=/usr/local/cuda/include
 CUDALIB=-L/usr/local/cuda/lib64 -lcuda -lcudart
 #NVCC=/usr/local/cuda/bin/nvcc
 #NVCFLAGS=-arch=sm_35 -g -G --ptxas-options=-v -O3
 NVCFLAGS=-arch=sm_35 --ptxas-options=-v -O3
 #### glib
 GLIBI=-I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include/
 GLIBL=-lglib-2.0 -L/usr/lib64
 # NVML
 NVML_INC=/usr/include/nvidia/gdk/
 NVML_LIB=-lnvidia-ml -L/usr/lib64/nvidia/
 INCLUDES= -I. -I$(CUDAINC) -I$(NVML_INC)
 LIBPATH= $(CUDALIB)
 OBJECTS=readsky.o dataio.o predict.o lmfit.o lbfgs.o myblas.o mderiv.o clmfit.o clmfit_nocuda.o residual.o barrier.o robust.o robustlm.o oslmfit.o mderiv_fl.o clmfit_fl.o updatenu.o robust_lbfgs_nocuda.o robust_fl.o manifold_fl.o rtr_solve_cuda.o rtr_solve_robust_cuda.o diagnostics.o diag_fl.o manifold_average.o consensus_poly.o rtr_solve_robust_cuda_admm.o rtr_solve_robust_admm.o admm_solve.o load_balance.o transforms.o stationbeam.o predict_withbeam.o  predict_withbeam_gpu.o predict_model.o predict_model_device.o
 default:libsagecal.a
 readsky.o:readsky.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 dataio.o:dataio.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 predict.o:predict.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 lmfit.o:lmfit.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 lbfgs.o:lbfgs.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 myblas.o:myblas.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 mderiv.o:mderiv.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 clmfit.o:clmfit.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 clmfit_nocuda.o:clmfit_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 residual.o:residual.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 barrier.o:barrier.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robustlm.o:robustlm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robust.o:robust.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 robust_fl.o:robust_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 oslmfit.o:oslmfit.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robust_lbfgs_nocuda.o:robust_lbfgs_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 clmfit_fl.o:clmfit_fl.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 updatenu.o:updatenu.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 mderiv_fl.o:mderiv_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 manifold_fl.o:manifold_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 rtr_solve_cuda.o:rtr_solve_cuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 rtr_solve_robust_cuda.o:rtr_solve_robust_cuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 diagnostics.o:diagnostics.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 diag_fl.o:diag_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 manifold_average.o:manifold_average.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 consensus_poly.o:consensus_poly.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 rtr_solve_robust_cuda_admm.o:rtr_solve_robust_cuda_admm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 rtr_solve_robust_admm.o:rtr_solve_robust_admm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 admm_solve.o:admm_solve.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 load_balance.o:load_balance.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 transforms.o:transforms.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 stationbeam.o:stationbeam.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 predict_withbeam.o:predict_withbeam.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 predict_withbeam_gpu.o:predict_withbeam_gpu.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 ## for dynamic parallelism, two stage compilation
 predict_model.o:predict_model_device.o
 	$(NVCC) $(NVCFLAGS) -lineinfo -dlink $(INCLUDES) $(GLIBI) -o $@ $<
 predict_model_device.o:predict_model.cu
 	$(NVCC) $(NVCFLAGS) -lineinfo -rdc=true $(INCLUDES) $(GLIBI) -o $@ -c $<
 RANLIB=ranlib
 libsagecal.a:$(OBJECTS) sagecal.h
 	ar rv $@ $(OBJECTS); \
 	$(RANLIB) $@;
--- a/src/lib/Radio/Makefile.gpu
+++ b/src/lib/Radio/Makefile.gpu
@ -4,12 +4,14 @@ NVCC=nvcc
 CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE
 CLIBS= -lm -lpthread
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 # LAPACK=-lblas -lgfortran -lpthread
 CUDAINC=/usr/local/cuda/include
 CUDALIB=-L/usr/local/cuda/lib64 -lcuda -lcudart
 #NVCC=/usr/local/cuda/bin/nvcc
 #NVCFLAGS=-arch=sm_35 -g -G --ptxas-options=-v -O3
-NVCFLAGS=-arch=sm_35 --ptxas-options=-v -O3
+NVCFLAGS=-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=compute_61 -gencode arch=compute_62,code=compute_62 --ptxas-options=-v -O3
 # NVCFLAGS=-gencode arch=compute_35,code=sm_35 --ptxas-options=-v -O3
 #### glib
 GLIBI=-I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include/
--- a/src/lib/Radio/libsagecal-gpu.a
+++ b/src/lib/Radio/libsagecal-gpu.a
--- a/src/lib/Radio/predict_model.cu
+++ b/src/lib/Radio/predict_model.cu
@ -610,6 +610,7 @@ kernel_coherencies(int B, int N, int T, int K, int F,float *u, float *v, float *
 #endif
   int ThreadsPerBlock=DEFAULT_TH_PER_BK;
   // int ThreadsPerBlock=16;
   /* each slave thread will calculate one source, 8xF values for all freq */
   /* also give right offset for coherencies */
   if (K<ThreadsPerBlock) {
@ -729,7 +730,8 @@ cudakernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude
  cudaMemset(buffer,0,sizeof(float)*2*Ntotal);
-  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
+  // int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  int ThreadsPerBlock=8;
  /* note: make sure we do not exceed max no of blocks available, otherwise (too many sources, loop over source id) */
  int BlocksPerGrid= 2*(Ntotal+ThreadsPerBlock-1)/ThreadsPerBlock;
  kernel_array_beam<<<BlocksPerGrid,ThreadsPerBlock>>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam,buffer);
@ -786,6 +788,7 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, fl
  /* spawn threads to handle baselines, these threads will spawn threads for sources */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  // int ThreadsPerBlock=16;
  /* note: make sure we do not exceed max no of blocks available, 
   otherwise (too many baselines, loop over source id) */
  int BlocksPerGrid= 2*(B+ThreadsPerBlock-1)/ThreadsPerBlock;
--- a/src/lib/Radio/reserve/radio-reserve.h
+++ b/src/lib/Radio/reserve/radio-reserve.h
@ -1,470 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #ifndef SAGECAL_H
 #define SAGECAL_H
 #ifdef __cplusplus
        extern "C" {
 #endif
 #include <Solvers.h>
 /* structures to store extra source info for extended sources */
 /* extra source info for a Gaussian extended source */
 typedef struct exinfo_gaussian_ {
  double eX,eY,eP; /* major,minor,PA */
  double cxi,sxi,cphi,sphi; /* projection of [0,0,1] to [l,m,n] */
  int use_projection; /* NOTE(review): presumably a flag telling whether the projection terms above are applied (cf. PROJ_CUT) -- confirm */
 } exinfo_gaussian;
 /* extra source info for a disk extended source */
 typedef struct exinfo_disk_ {
  double eX; /* diameter */
  double cxi,sxi,cphi,sphi; /* projection of [0,0,1] to [l,m,n] */
  int use_projection; /* NOTE(review): presumably a flag telling whether the projection terms above are applied (cf. PROJ_CUT) -- confirm */
 } exinfo_disk;
 /* extra source info for a ring extended source */
 typedef struct exinfo_ring_ {
  double eX; /* diameter */
  double cxi,sxi,cphi,sphi; /* projection of [0,0,1] to [l,m,n] */
  int use_projection; /* NOTE(review): presumably a flag telling whether the projection terms above are applied (cf. PROJ_CUT) -- confirm */
 } exinfo_ring;
 /* extra source info for a shapelet extended source */
 typedef struct exinfo_shapelet_ {
  int n0; /* model order, no of modes=n0*n0 */
  double beta; /* scale */
  double *modes; /* array of n0*n0 x 1 values */
  double eX,eY,eP; /* linear transform parameters */
  double cxi,sxi,cphi,sphi; /* projection of [0,0,1] to [l,m,n] */
  int use_projection; /* NOTE(review): presumably a flag telling whether the projection terms above are applied (cf. PROJ_CUT) -- confirm */
 } exinfo_shapelet;
 /* when to project l,m coordinates */
 #ifndef PROJ_CUT
 #define PROJ_CUT 0.998
 #endif
 /* struct for a cluster GList item */
 typedef struct clust_t_{
 int id; /* cluster id */
 int nchunk; /* no of chunks the data is divided for solving */
 GList *slist; /* list of sources in this cluster (string);
                 NOTE(review): items are presumably clust_n -- confirm in readsky.c */
 } clust_t;
 /* holds a single source name;
    NOTE(review): presumably the item type stored in clust_t.slist -- confirm */
 typedef struct clust_n_{
 char *name; /* source name (string) */
 } clust_n;
 /* struct to store source info in hash table (one entry per source) */
 typedef struct sinfo_t_ {
 double ll,mm,ra,dec,sI[4]; /* sI:4x1 for I,Q,U,V, note sI is updated for central freq (ra,dec) for Az,El;
                              NOTE(review): ll,mm presumably direction cosines (cf. nn in clus_source_t) -- confirm */
 unsigned char stype; /* source type */
 void *exdata; /* pointer to carry additional data, if needed */
 double sI0[4],f0,spec_idx,spec_idx1,spec_idx2; /* for multi channel data, original sI,Q,U,V, f0 and spectral index */
 } sinfo_t;
 /* struct for array of the sky model, with clusters:
    one instance describes one cluster and holds per-source arrays of length N */
 typedef struct clus_source_t_ {
 int N; /* no of source in this cluster */
 int id; /* cluster id */
 double *ll,*mm,*nn,*sI,*sQ,*sU,*sV; /* arrays Nx1 of source info, note: sI is at reference freq of data */
 /* nn=sqrt(1-ll^2-mm^2)-1 */
 double *ra,*dec; /* arrays Nx1 for Az,El calculation */
 unsigned char *stype; /* source type array Nx1 */
 void **ex; /* array for extra source information Nx1;
               NOTE(review): entries presumably point to exinfo_* structs chosen by stype -- confirm */
 int nchunk; /* no of chunks the data is divided for solving */
 int *p; /* array nchunkx1 points to parameter array indices */
 double *sI0,*sQ0,*sU0,*sV0,*f0,*spec_idx,*spec_idx1,*spec_idx2; /* for multi channel data, original sI, f0 and spectral index */
 } clus_source_t;
 /* struct to store baseline to station mapping */
 typedef struct baseline_t_ {
 int sta1,sta2; /* the two stations that this baseline maps to */
 unsigned char flag; /* if this baseline is flagged, set to 1, otherwise 0:
             special case: 2 if baseline is not used in solution, but will be
              subtracted */
 } baseline_t;
 /****************************** readsky.c ****************************/
 /* read sky/cluster files, 
   carr:  return array size Mx1 of clusters
   M : no of clusters
   freq0: obs frequency Hz
   ra0,dec0 : ra,dec of phase center (radians)
   format: 0: LSM, 1: LSM with 3 order spec index
   each element has source info for that cluster */
 extern int
 read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **carr, int *M, double freq0, double ra0, double dec0,int format);
 /* read solution file, only a set of solutions and load to p
  sfp: solution file pointer
  p: solutions vector Mt x 1
  carr: for getting correct offset in p
  N : stations 
  M : clusters
 */
 extern int
 read_solutions(FILE *sfp,double *p,clus_source_t *carr,int N,int M);
 /* set ignlist[ci]=1 if 
  cluster id 'cid' is mentioned in ignfile and carr[ci].id==cid
 */ 
 extern int
 update_ignorelist(const char *ignfile, int *ignlist, int M, clus_source_t *carr);
 /* read ADMM regularization factor per cluster from text file, format:
 cluster_id  hybrid_parameter admm_rho
 ...
 ...
 (M values)
 and store it in array arho : size Mtx1, taking into account the hybrid parameter
 also in array arhoslave : size Mx1, without taking hybrid params into account
 admm_rho : can be 0 to ignore consensus, just normal calibration
 */
 extern int
 read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double *arhoslave);
 /****************************** predict.c ****************************/
 /************* extended source contributions ************/
 extern complex double
 shapelet_contrib(void*dd, double u, double v, double w);
 extern complex double
 gaussian_contrib(void*dd, double u, double v, double w);
 extern complex double
 ring_contrib(void*dd, double u, double v, double w);
 extern complex double
 disk_contrib(void*dd, double u, double v, double w);
 /* time smearing TMS eq. 6.80 for EW-array formula 
  note u,v,w: meter/c so multiply by freq. to get wavelength 
  ll,mm: source
  dec0: phase center declination
  tdelta: integration time */
 extern double 
 time_smear(double ll,double mm,double dec0,double tdelta,double u,double v,double w,double freq0);
 /* predict visibilities
  u,v,w: u,v,w coordinates (wavelengths) size Nbase*tilesz x 1 
  u,v,w are ordered with baselines, timeslots
  x: data to write size Nbase*8*tilesz x 1
   ordered by XX(re,im),XY(re,im),YX(re,im), YY(re,im), baseline, timeslots
  N: no of stations
  Nbase: no of baselines
  tilesz: tile size
  barr: baseline to station map, size Nbase*tilesz x 1
  carr: sky model/cluster info size Mx1 of clusters
  M: no of clusters
  freq0: frequency
  fdelta: bandwidth for freq smearing
  tdelta: integration time for time smearing
  dec0: declination for time smearing
  Nt: no of threads
 */
 extern int
 predict_visibilities(double *u, double *v, double *w, double *x, int N, 
   int Nbase, int tilesz,  baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, int Nt); 
 /* precalculate cluster coherencies
  u,v,w: u,v,w coordinates (wavelengths) size Nbase*tilesz x 1 
  u,v,w are ordered with baselines, timeslots
  x: coherencies size Nbase*4*Mx 1
   ordered by XX(re,im),XY(re,im),YX(re,im), YY(re,im), baseline, timeslots
  N: no of stations
  Nbase: no of baselines (including more than one tile)
  barr: baseline to station map, size Nbase*tilesz x 1
  carr: sky model/cluster info size Mx1 of clusters
  M: no of clusters
  freq0: frequency
  fdelta: bandwidth for freq smearing
  tdelta: integration time for time smearing
  dec0: declination for time smearing
  uvmin: baseline length sqrt(u^2+v^2) below which not to include in solution
  uvmax: baseline length higher than this not included in solution
  Nt: no of threads
  NOTE: prediction is done for all baselines, even flagged ones
  and flags are set to 2 for baselines lower than uvcut
 */
 extern int
 precalculate_coherencies(double *u, double *v, double *w, complex double *x, int N,
   int Nbase, baseline_t *barr,  clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, int Nt); 
 /* rearranges coherencies for GPU use later */
 /* barr: 2*Nbase x 1
   coh: M*Nbase*4 x 1 complex
   ddcoh: M*Nbase*8 x 1
   ddbase: 2*Nbase x 1 (sta1,sta2) = -1 if flagged
 */
 extern int
 rearrange_coherencies(int Nbase, baseline_t *barr, complex double *coh, double *ddcoh, short *ddbase, int M, int Nt);
 /* ddbase: 3*Nbase x 1 (sta1,sta2,flag) */
 extern int
 rearrange_coherencies2(int Nbase, baseline_t *barr, complex double *coh, double *ddcoh, short *ddbase, int M, int Nt);
 /* rearranges baselines for GPU use later */
 /* barr: 2*Nbase x 1
   ddbase: 2*Nbase x 1
 */
 extern int
 rearrange_baselines(int Nbase, baseline_t *barr, short *ddbase, int Nt);
 /* count how many baselines contribute to each station */
 extern int
 count_baselines(int Nbase, int N, float *iw, short *ddbase, int Nt);
 /* initialize array b (size Nx1) to given value a */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 extern void
 setweights(int N, double *b, double a, int Nt);
 /* update baseline flags, also make data zero if flagged
  this is needed for solving (calculate error) ignore flagged data */
 /* Nbase: total actual data points = Nbasextilesz
   flag: flag array Nbasex1
   barr: baseline array Nbasex1
   x: data Nbase*8 x 1 ( 8 value per baseline ) 
   Nt: no of threads 
 */
 extern int
 preset_flags_and_data(int Nbase, double *flag, baseline_t *barr, double *x, int Nt);
 /* generate baselines -> sta1,sta2 pairs for later use */
 /* barr: Nbasextilesz
   N : stations
   Nt : threads 
 */
 extern int
 generate_baselines(int Nbase, int tilesz, int N, baseline_t *barr,int Nt);
 /* convert types */
 /* both arrays size nx1 
   Nt: no of threads
 */
 extern int
 double_to_float(float *farr, double *darr,int n, int Nt);
 extern int
 float_to_double(double *darr, float *farr,int n, int Nt);
 /* create a vector with 1's at flagged data points */
 /* 
   ddbase: 3*Nbase x 1 (sta1,sta2,flag)
   x: 8*Nbase (set to 0's and 1's)
 */
 extern int
 create_onezerovec(int Nbase, short *ddbase, float *x, int Nt);
 /* 
  find sum1=sum(|x|), and sum2=y^T |x|
  x,y: nx1 arrays
 */
 extern int
 find_sumproduct(int N, float *x, float *y, float *sum1, float *sum2, int Nt);
 /****************************** transforms.c ****************************/
 #ifndef ASEC2RAD
 #define ASEC2RAD 4.848136811095359935899141e-6
 #endif
 /* 
 convert xyz ITRF 2000 coords (m) to
 long,lat, (rad) height (m)
 References:
 */
 extern int
 xyz2llh(double *x, double *y, double *z, double *longitude, double *latitude, double *height, int N);
 /* convert ra,dec to az,el
   ra,dec: radians
   longitude,latitude: rad,rad 
   time_jd: JD days
   az,el: output  rad,rad
 References: Darin C. Koblick MATLAB code, based on
  % Fundamentals of Astrodynamics and Applications 
 % D. Vallado, Second Edition
 % Example 3-5. Finding Local Siderial Time (pg. 192) 
 % Algorithm 28: AzElToRaDec (pg. 259)
 */
 extern int
 radec2azel(double ra, double dec, double longitude, double latitude, double time_jd, double *az, double *el);
 /* convert time to Greenwich Mean Sidereal Angle (deg)
   time_jd : JD days
   thetaGMST : GMST angle (deg)
 */
 extern int
 jd2gmst(double time_jd, double *thetaGMST); 
 /* convert ra,dec to az,el
   ra,dec: radians
   longitude,latitude: rad,rad 
   thetaGMST : GMST angle (deg)
   az,el: output  rad,rad
 */
 extern int
 radec2azel_gmst(double ra, double dec, double longitude, double latitude, double thetaGMST, double *az, double *el); 
 /* given the epoch jd_tdb2, 
 calculate rotation matrix params needed to precess from J2000 
   NOVAS (Naval Observatory Vector Astronomy Software)
   PURPOSE:
      Precesses equatorial rectangular coordinates from one epoch to
      another.  One of the two epochs must be J2000.0.  The coordinates
      are referred to the mean dynamical equator and equinox of the two
      respective epochs.
   REFERENCES:
      Explanatory Supplement To The Astronomical Almanac, pp. 103-104.
      Capitaine, N. et al. (2003), Astronomy And Astrophysics 412,
         pp. 567-586.
      Hilton, J. L. et al. (2006), IAU WG report, Celest. Mech., 94,
         pp. 351-367.
 */
 extern int
 get_precession_params(double jd_tdb2, double Tr[9]);
 /* precess  ra0,dec0 at J2000
   to ra,dec at epoch given by transform Tr
 using NOVAS library */
 extern int
 precession(double ra0, double dec0, double Tr[9], double *ra, double *dec);
 /****************************** stationbeam.c ****************************/
 /* 
  ra,dec: source direction (rad)
  ra0,dec0: beam center (rad)
  f: frequency (Hz)
  f0: beam forming frequency (Hz)
  longitude,latitude : Nx1 array of station positions (rad,rad)
  time_jd: JD (day) time
  Nelem : Nx1 array, no. of elements used in each station
  x,y,z: Nx1 pointer arrays to station positions, each station has Nelem[]x1 arrays
  beamgain: Nx1 array of station beam gain along the source direction
 */ 
 extern int
 arraybeam(double ra, double dec, double ra0, double dec0, double f, double f0, int N, double *longitude, double *latitude, double time_jd, int *Nelem, double **x, double **y, double **z, double *beamgain);
 /****************************** predict_withbeam.c ****************************/
 /* precalculate cluster coherencies
  u,v,w: u,v,w coordinates (wavelengths) size Nbase*tilesz x 1 
  u,v,w are ordered with baselines, timeslots
  x: coherencies size Nbase*4*Mx 1
   ordered by XX(re,im),XY(re,im),YX(re,im), YY(re,im), baseline, timeslots
  N: no of stations
  Nbase: total no of baselines (including more than one tile or timeslot)
  barr: baseline to station map, size Nbase*tilesz x 1
  carr: sky model/cluster info size Mx1 of clusters
  M: no of clusters
  freq0: frequency
  fdelta: bandwidth for freq smearing
  tdelta: integration time for time smearing
  dec0: declination for time smearing
  uvmin: baseline length sqrt(u^2+v^2) below which not to include in solution
  uvmax: baseline length higher than this not included in solution
  Station beam specific parameters
  ph_ra0,ph_dec0: beam pointing rad,rad
  ph_freq0: beam reference freq
  longitude,latitude: Nx1 arrays (rad,rad) station locations
  time_utc: JD (day) : tilesz x 1 
  tilesz: how many tiles: == unique time_utc
  Nelem: Nx1 array, size of stations (elements)
  xx,yy,zz: Nx1 arrays of station element locations arrays xx[],yy[],zz[]
  Nt: no of threads
  NOTE: prediction is done for all baselines, even flagged ones
  and flags are set to 2 for baselines lower than uvcut
 */
 /* CPU version: fills x with the precalculated cluster coherencies (station beam applied);
    tilesz is the number of unique entries in time_utc */
 extern int
 precalculate_coherencies_withbeam(double *u, double *v, double *w, complex double *x, int N,
   int Nbase, baseline_t *barr,  clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, 
 double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int tilesz, int *Nelem, double **xx, double **yy, double **zz, int Nt);
 extern int
 predict_visibilities_multifreq_withbeam(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
 double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int Nt, int add_to_data);
 extern int
 calculate_residuals_multifreq_withbeam(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,
 double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int Nt, int ccid, double rho, int phase_only);
 /* change epoch of source ra,dec from J2000 to JAPP */
 /* also the beam pointing ra_beam,dec_beam */
 extern int
 precess_source_locations(double jd_tdb, clus_source_t *carr, int M, double *ra_beam, double *dec_beam, int Nt);
 /****************************** predict_withbeam_gpu.c ****************************/
 /* if dobeam==0, beam calculation is off */
 /* GPU version of the coherency precalculation; takes the same arguments as the
    CPU version plus dobeam; tilesz is the number of unique entries in time_utc */
 extern int
 precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex double *x, int N,
   int Nbase, baseline_t *barr,  clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, 
 double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int tilesz, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt);
 extern int
 predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
 double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data);
 /****************************** predict_model.cu ****************************/
 extern void
 cudakernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude, float *latitude,
 double *time_utc, int *Nelem, float **xx, float **yy, float **zz, float *ra, float *dec, float ph_ra0, float  ph_dec0, float ph_freq0, float *beam);
 extern void
 cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, float *w,baseline_t *barr, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI,
  unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *coh,int dobeam);
 extern void
 cudakernel_convert_time(int T, double *time_utc);
 #ifdef __cplusplus
     } /* extern "C" */
 #endif
--- a/src/lib/Solvers/.Common.h.swp
+++ b/src/lib/Solvers/.Common.h.swp
--- a/src/lib/Solvers/.Dirac.h.swp
+++ b/src/lib/Solvers/.Dirac.h.swp
--- a/src/lib/Solvers/.gitignore
+++ b/src/lib/Solvers/.gitignore
@ -1,2 +0,0 @@
 *.swp
 *.swo
--- a/src/lib/Solvers/Common.h
+++ b/src/lib/Solvers/Common.h
--- a/src/lib/Solvers/Dirac.h
+++ b/src/lib/Solvers/Dirac.h
--- a/src/lib/Solvers/Makefile
+++ b/src/lib/Solvers/Makefile
@ -1,54 +0,0 @@
 # Build rules for the CPU-only solver library libsolvers.a.
 # Each object file is compiled from the C source with the same base name.
 CC=gcc
 CXX=g++
 #CFLAGS= -Wall -O3 -g #-pg
 CFLAGS= -Wall -O3 -fopt-info-optimized
 CLIBS= -lm -lpthread
 # BLAS/LAPACK link line: exactly one of the alternatives below must be active
 #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
 #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 INCLUDES= -I. 
 LIBPATH=
 #### glib
 GLIBI=-I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -I/usr/lib/x86_64-linux-gnu/glib-2.0/include/ -I/usr/lib64/glib-2.0/include
 GLIBL=-lglib-2.0
 # objects that are archived into libsolvers.a
 OBJECTS= lmfit_nocuda.o clmfit_nocuda.o lbfgs_nocuda.o myblas.o residual.o robustlm.o updatenu.o robust_lbfgs_nocuda.o rtr_solve.o  rtr_solve_robust.o manifold_average.o consensus_poly.o rtr_solve_robust_admm.o admm_solve.o
 # first target in the file, so a plain 'make' builds the library
 default:libsolvers.a
 lmfit_nocuda.o:lmfit_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 clmfit_nocuda.o:clmfit_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 lbfgs_nocuda.o:lbfgs_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 myblas.o:myblas.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 residual.o:residual.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robustlm.o:robustlm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 updatenu.o:updatenu.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robust_lbfgs_nocuda.o:robust_lbfgs_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 rtr_solve.o:rtr_solve.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 rtr_solve_robust.o:rtr_solve_robust.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 manifold_average.o:manifold_average.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 consensus_poly.o:consensus_poly.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 rtr_solve_robust_admm.o:rtr_solve_robust_admm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 admm_solve.o:admm_solve.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 RANLIB=ranlib
 # archive the objects, then index the archive; '&&' makes the recipe fail as soon
 # as ar fails, instead of reporting only the exit status of ranlib
 libsolvers.a:$(OBJECTS) Solvers.h
 	ar rv $@ $(OBJECTS) && \
 	$(RANLIB) $@
--- a/src/lib/Solvers/Makefile.gpu
+++ b/src/lib/Solvers/Makefile.gpu
@ -1,88 +0,0 @@
 # Build rules for the GPU solver library libsolvers-gpu.a.
 # .c sources are compiled with $(CC), .cu sources with $(NVCC).
 CC=gcc
 CXX=g++
 NVCC=nvcc
 CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE
 CLIBS= -lm -lpthread
 LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
 CUDAINC=/usr/local/cuda/include
 CUDALIB=-L/usr/local/cuda/lib64 -lcuda -lcudart
 #NVCC=/usr/local/cuda/bin/nvcc
 #NVCFLAGS=-arch=sm_35 -g -G --ptxas-options=-v -O3
 NVCFLAGS=-arch=sm_35 --ptxas-options=-v -O3
 #### glib
 GLIBI=-I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include/
 GLIBL=-lglib-2.0 -L/usr/lib64
 # NVML
 NVML_INC=/usr/include/nvidia/gdk/
 NVML_LIB=-lnvidia-ml -L/usr/lib64/nvidia/
 INCLUDES= -I. -I$(CUDAINC) -I$(NVML_INC)
 LIBPATH= $(CUDALIB)
 # objects that are archived into libsolvers-gpu.a
 OBJECTS=lmfit.o lbfgs.o myblas.o mderiv.o clmfit.o clmfit_nocuda.o residual.o barrier.o robust.o robustlm.o oslmfit.o mderiv_fl.o clmfit_fl.o updatenu.o robust_lbfgs_nocuda.o robust_fl.o manifold_fl.o rtr_solve_cuda.o rtr_solve_robust_cuda.o diagnostics.o diag_fl.o manifold_average.o consensus_poly.o rtr_solve_robust_cuda_admm.o rtr_solve_robust_admm.o admm_solve.o load_balance.o 
 # first target in the file, so a plain 'make -f Makefile.gpu' builds the library
 default:libsolvers-gpu.a
 lmfit.o:lmfit.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 lbfgs.o:lbfgs.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 myblas.o:myblas.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 mderiv.o:mderiv.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 clmfit.o:clmfit.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 clmfit_nocuda.o:clmfit_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 residual.o:residual.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 barrier.o:barrier.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robustlm.o:robustlm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robust.o:robust.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 robust_fl.o:robust_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 oslmfit.o:oslmfit.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 robust_lbfgs_nocuda.o:robust_lbfgs_nocuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 clmfit_fl.o:clmfit_fl.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 updatenu.o:updatenu.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 mderiv_fl.o:mderiv_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 manifold_fl.o:manifold_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 rtr_solve_cuda.o:rtr_solve_cuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 rtr_solve_robust_cuda.o:rtr_solve_robust_cuda.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 diagnostics.o:diagnostics.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 diag_fl.o:diag_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 manifold_average.o:manifold_average.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 consensus_poly.o:consensus_poly.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 rtr_solve_robust_cuda_admm.o:rtr_solve_robust_cuda_admm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 rtr_solve_robust_admm.o:rtr_solve_robust_admm.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
 admm_solve.o:admm_solve.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 load_balance.o:load_balance.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 RANLIB=ranlib
 # archive the objects, then index the archive; '&&' makes the recipe fail as soon
 # as ar fails, instead of reporting only the exit status of ranlib
 libsolvers-gpu.a:$(OBJECTS) Solvers.h
 	ar rv $@ $(OBJECTS) && \
 	$(RANLIB) $@
--- a/src/lib/Solvers/Solvers.h
+++ b/src/lib/Solvers/Solvers.h
--- a/src/lib/Solvers/admm_solve.c
+++ b/src/lib/Solvers/admm_solve.c
--- a/src/lib/Solvers/admm_solve.o
+++ b/src/lib/Solvers/admm_solve.o
--- a/src/lib/Solvers/barrier.c
+++ b/src/lib/Solvers/barrier.c
@ -1,121 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include "Solvers.h"
 /* implementation of a barrier to sync threads.
  The barrier has two doors (enter and exit). Only one door 
  can be open at a time. Initially the enter door is open.
  All threads that enter the barrier are sleeping (wait).
  The last thread to enter the barrier will 
   1)close the enter door
   2)wakeup all sleeping threads.
   3)open the exit door.
  So the woken up threads will leave the barrier one by 
  one, as they are awoken. The last thread to leave the barrier
  will
   1)open the enter door 
   2)close the exit door,
  So finally the barrier reaches its initial state
 */
 /* set up a barrier for use by N threads */
 /* barrier: barrier to initialize */
 /* N - no. of accommodated threads */
 void
 init_th_barrier(th_barrier *barrier, int N)
 {
 /* the two doors (mutexes) with default attributes */
 pthread_mutex_init(&barrier->enter_mutex,NULL);
 pthread_mutex_init(&barrier->exit_mutex,NULL);
 /* the two condition variables with default attributes */
 pthread_cond_init(&barrier->lastthread_cond,NULL);
 pthread_cond_init(&barrier->exit_cond,NULL);
 /* N threads expected, none inside the barrier yet */
 barrier->nthreads=N;
 barrier->tcount=0;
 }
 /* release the mutexes and condition variables of the barrier
    and reset both of its counters to zero */
 void
 destroy_th_barrier(th_barrier *barrier)
 {
 /* doors */
 pthread_mutex_destroy(&barrier->enter_mutex);
 pthread_mutex_destroy(&barrier->exit_mutex);
 /* condition variables */
 pthread_cond_destroy(&barrier->lastthread_cond);
 pthread_cond_destroy(&barrier->exit_cond);
 /* counters */
 barrier->nthreads=0;
 barrier->tcount=0;
 }
 /* the main operation of the barrier:
   block the calling thread until barrier->nthreads threads have called
   sync_barrier(), then let all of them leave.
   If barrier->nthreads < 2 the call returns immediately. */
 void
 sync_barrier(th_barrier *barrier)
 {
 /* trivial case */
 if(barrier->nthreads <2) return;
 /* else */
 /* a new thread enters the barrier. Now close the entry door
  so that other threads cannot enter the barrier until we are done */
 pthread_mutex_lock(&barrier->enter_mutex);
 /* next lock the exit mutex - no threads can leave either */
 pthread_mutex_lock(&barrier->exit_mutex);
 /* now check to see if this is the last expected thread */
 if( ++(barrier->tcount) < barrier->nthreads) {
  /* no. this is not the last thread. so open the entry door */
  pthread_mutex_unlock(&barrier->enter_mutex);
  /* go to sleep (exit_mutex is released while waiting) */
  /* NOTE(review): this wait has no predicate, so a spurious wakeup would let
    the thread leave early; guarding it needs extra state in th_barrier */
  pthread_cond_wait(&barrier->exit_cond,&barrier->exit_mutex);
 } else {
  /* this is the last thread */
  /* wakeup sleeping threads */
  pthread_cond_broadcast(&barrier->exit_cond);
  /* go to sleep until all other threads are woken up
    and have left the barrier: tcount drops to 1 when only this thread
    remains. Re-check tcount after every wakeup so that a spurious wakeup
    cannot open the entry door while threads are still inside */
  while (barrier->tcount>1) {
   pthread_cond_wait(&barrier->lastthread_cond,&barrier->exit_mutex);
  }
  /* now all threads have left the barrier. so open the entry door again */
  pthread_mutex_unlock(&barrier->enter_mutex);
 }
 /* next to the last thread leaving the barrier */
 if(--(barrier->tcount)==1) {
  /* wakeup the last sleeping thread */
  pthread_cond_broadcast(&barrier->lastthread_cond);
 }
 pthread_mutex_unlock(&barrier->exit_mutex);
 } 
 /* master and two slaves */
 //int
 //main(int argc, char *argv[]) {
 // th_pipeline p;
 // 
 // gbdata g;
 //
 // init_pipeline(&p,&g);
 //sync_barrier(&(p.gate1)); /* stop at gate 1 */
 //   g.status=0; /* master work */
 //sync_barrier(&(p.gate2)); /* stop at gate 2 */
 // //exec_pipeline(&p);
 //sync_barrier(&(p.gate1)); /* stop at gate 1 */
 // g.status=10; /* master work */
 //sync_barrier(&(p.gate2)); /* stop at gate 2 */
 // //exec_pipeline(&p);
 // destroy_pipeline(&p);
 // /* still need to free slave_data structs, from data */
 // return 0;
 //}
--- a/src/lib/Solvers/barrier.o
+++ b/src/lib/Solvers/barrier.o
--- a/src/lib/Solvers/clmfit.c
+++ b/src/lib/Solvers/clmfit.c
--- a/src/lib/Solvers/clmfit.o
+++ b/src/lib/Solvers/clmfit.o
--- a/src/lib/Solvers/clmfit_fl.c
+++ b/src/lib/Solvers/clmfit_fl.c
--- a/src/lib/Solvers/clmfit_fl.o
+++ b/src/lib/Solvers/clmfit_fl.o
--- a/src/lib/Solvers/clmfit_nocuda.c
+++ b/src/lib/Solvers/clmfit_nocuda.c
--- a/src/lib/Solvers/clmfit_nocuda.o
+++ b/src/lib/Solvers/clmfit_nocuda.o
--- a/src/lib/Solvers/consensus_poly.c
+++ b/src/lib/Solvers/consensus_poly.c
@ -1,349 +0,0 @@
 /*
 *
 Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "Solvers.h"
 #include <math.h>
 #include <stdio.h>
 //#define DEBUG
 /* build matrix with polynomial terms
  B : Npoly x Nf, each row is one basis function
      (basis function cm at frequency ci is stored in B[ci*Npoly+cm])
  Npoly : total basis functions
  Nf: frequencies
  freqs: Nfx1 array freqs
  freq0: reference freq
  type : 
  0 :[1 ((f-fo)/fo) ((f-fo)/fo)^2 ...] basis functions
  1 : same as 0, followed by normalizing each row such that norm is 1
  2 : Bernstein poly N_C_r x^r (1-x)^(N-r) with N=Npoly-1, r=0..N, where x in [0,1] : use min,max values of freq to normalize
     Note: freqs might not be in sorted order, so need to search array to find min,max values
  3: [1 ((f-fo)/fo) (fo/f-1) ((f-fo)/fo)^2 (fo/f-1)^2 ... ] basis, for this case odd Npoly  preferred
  returns 0 in all cases; for an undefined type a message is printed to stderr
  and B is not modified
 */
 int
 setup_polynomials(double *B, int Npoly, int Nf, double *freqs, double freq0, int type) {
  if (type==0 || type==1) {
  double frat,dsum;
  double invf=1.0/freq0;
  int ci,cm;
  for (ci=0; ci<Nf; ci++) {
     B[ci*Npoly]=1.0;
     frat=(freqs[ci]-freq0)*invf;
     /* each term is the previous one times (f-fo)/fo */
     for (cm=1; cm<Npoly; cm++) {
      B[ci*Npoly+cm]=B[ci*Npoly+cm-1]*frat;
     }
  }
 #ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]); 
   }
   printf("\n");
  }
  printf("];\n");
 #endif
  if (type==1) {
   /* normalize each row such that norm is 1 */
   for (cm=0; cm<Npoly; cm++) {
     dsum=0.0;
     for (ci=0; ci<Nf; ci++) {
      dsum+=B[ci*Npoly+cm]*B[ci*Npoly+cm];
     }
     /* an all-zero row is left as zeros */
     if (dsum>0.0) {
      invf=1.0/sqrt(dsum);
     } else {
      invf=0.0;
     }
     for (ci=0; ci<Nf; ci++) {
      B[ci*Npoly+cm] *=invf;
     }
   }
  }
  } else if (type==2) {
   /* Bernstein polynomials */
   /* indices are used with -1 below, i.e. taken to be 1-based
     (presumably BLAS idamax/idamin convention - confirm in myblas.c) */
   int idmax=my_idamax(Nf, freqs, 1);
   int idmin=my_idamin(Nf, freqs, 1);
   double fmax=freqs[idmax-1];
   double fmin=freqs[idmin-1];
   double *fact; /* factorial array */
   double *px,*p1x; /* arrays for powers of x and (1-x) */
   if ((fact=(double*)calloc((size_t)Npoly,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
   }
   if ((px=(double*)calloc((size_t)Npoly*Nf,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
   }
   if ((p1x=(double*)calloc((size_t)Npoly*Nf,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
   }
   fact[0]=1.0;
   int ci,cj;
   for (ci=1; ci<Npoly; ci++) {
     fact[ci]=fact[ci-1]*(double)ci;
   }
   /* if all frequencies are equal (e.g. Nf==1) avoid dividing by zero:
     every frequency is then mapped to x=0 */
   double invf=(fmax!=fmin?1.0/(fmax-fmin):0.0);
   double frat;
   for (ci=0; ci<Nf; ci++) {
     /* normalize coordinates */
     frat=(freqs[ci]-fmin)*invf;
     /* zeroth powers */
     px[ci]=1.0;
     p1x[ci]=1.0;
     /* first powers: this row of px,p1x only exists when Npoly>1 */
     if (Npoly>1) {
      px[ci+Nf]=frat;
      p1x[ci+Nf]=1.0-frat;
     }
   }
   /* higher powers: previous power times first power */
   for (cj=2; cj<Npoly; cj++) {
    for (ci=0; ci<Nf; ci++) {
     px[cj*Nf+ci]=px[(cj-1)*Nf+ci]*px[Nf+ci]; 
     p1x[cj*Nf+ci]=p1x[(cj-1)*Nf+ci]*p1x[Nf+ci]; 
    }
   }
   for (cj=0; cj<Npoly; cj++) { /* ci: freq, cj: poly order */
     /* binomial coefficient (Npoly-1) choose cj */
     frat=fact[Npoly-1]/(fact[Npoly-cj-1]*fact[cj]);
     for (ci=0; ci<Nf; ci++) {
      B[ci*Npoly+cj]=frat*px[cj*Nf+ci]*p1x[(Npoly-cj-1)*Nf+ci];
     }
   }
 #ifdef DEBUG
   printf("BT=[\n");
   for(cj=0; cj<Npoly; cj++) {
    for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]); 
   }
   printf("\n");
   }
   printf("];\n");
 #endif
   free(fact);
   free(px);
   free(p1x);
  } else if (type==3) { /* [1 (f-fo)/fo (fo/f-1) ... */
   double frat;
   double invf=1.0/freq0;
   int ci,cm;
   for (ci=0; ci<Nf; ci++) {
     B[ci*Npoly]=1.0;
     frat=(freqs[ci]-freq0)*invf;
     double lastval=frat;
     for (cm=1; cm<Npoly; cm+=2) { /* odd values 1,3,5,... */
      B[ci*Npoly+cm]=lastval;
      lastval*=frat;
     }
     frat=(freq0/freqs[ci]-1.0);
     lastval=frat;
     for (cm=2; cm<Npoly; cm+=2) { /* even values 2,4,6,... */
      B[ci*Npoly+cm]=lastval;
      lastval*=frat;
     }
   }
 #ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]); 
   }
   printf("\n");
  }
  printf("];\n");
 #endif
  } else {
    fprintf(stderr,"%s : %d: undefined polynomial type\n",__FILE__,__LINE__);
  }
  return 0;
 }
 /* find the pseudo inverse of the weighted sum of outer products of the basis
  B : Npoly x Nf, each row is one basis function (input)
  Bi: Npoly x Npoly (output) pseudo inverse of sum_col fratio[col] * B(:,col) x B(:,col)'
  Npoly : total basis functions
  Nf: frequencies
  fratio: Nfx1 array of weighing factors depending on the flagged data of each freq
  The sum is decomposed by SVD, singular values above CLM_EPSILON are inverted
  (the others set to zero) and U 1/S V^T is written back to Bi.
  NOTE(review): U 1/S V^T is used where the general pseudo inverse is V 1/S U^T;
  presumably fine because the sum is symmetric - confirm fratio is never negative.
  returns 0; exits on allocation failure or LAPACK error
 */
 int
 find_prod_inverse(double *B, double *Bi, int Npoly, int Nf, double *fratio) {
  int k,info,nwork=0;
  double wquery[1],*work,*U,*S,*VT;
  /* start with Bi=0 and accumulate one weighted outer product per frequency */
  memset(Bi,0,sizeof(double)*Npoly*Npoly);
  for (k=0; k<Nf; k++) { 
   my_dgemm('N','T',Npoly,Npoly,1,fratio[k],&B[k*Npoly],Npoly,&B[k*Npoly],Npoly,1.0,Bi,Npoly);
  }
 #ifdef DEBUG
  int c;
  printf("BT=[\n");
  for (k=0; k<Nf; k++) {
   for(c=0; c<Npoly; c++) {
    printf("%lf ",B[k*Npoly+c]); 
   }
   printf("\n");
  }
  printf("];\nBi=[\n");
  for (k=0; k<Npoly; k++) {
   for(c=0; c<Npoly; c++) {
    printf("%lf ",Bi[k*Npoly+c]); 
   }
   printf("\n");
  }
  printf("];\n");
 #endif
  /* storage for the SVD factors */
  U=(double*)calloc((size_t)Npoly*Npoly,sizeof(double));
  if (U==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  VT=(double*)calloc((size_t)Npoly*Npoly,sizeof(double));
  if (VT==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  S=(double*)calloc((size_t)Npoly,sizeof(double));
  if (S==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  /* first call with lwork=-1: only the work array size is returned in wquery[0] */
  info=my_dgesvd('A','A',Npoly,Npoly,Bi,Npoly,S,U,Npoly,VT,Npoly,wquery,-1);
  if (info) {
    printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
    exit(1);
  }
  nwork=(int)wquery[0];
  work=(double*)calloc((size_t)nwork,sizeof(double));
  if (work==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  /* second call: the actual decomposition */
  info=my_dgesvd('A','A',Npoly,Npoly,Bi,Npoly,S,U,Npoly,VT,Npoly,work,nwork);
  if (info) {
    printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
    exit(1);
  }
  /* invert the singular values above the threshold (zero the rest),
    and scale the matching column of U by the result */
  for (k=0; k<Npoly; k++) {
   S[k]=(S[k]>CLM_EPSILON?1.0/S[k]:0.0);
   my_dscal(Npoly,S[k],&U[k*Npoly]);
  }
  /* Bi = (U 1/S) V^T */
  my_dgemm('N','N',Npoly,Npoly,Npoly,1.0,U,Npoly,VT,Npoly,0.0,Bi,Npoly);
 #ifdef DEBUG
  printf("Bii=[\n");
  for (k=0; k<Npoly; k++) {
   for(c=0; c<Npoly; c++) {
    printf("%lf ",Bi[k*Npoly+c]); 
   }
   printf("\n");
  }
  printf("];\n");
 #endif
  free(U);
  free(S);
  free(VT);
  free(work);
  return 0;
 }
 /* update Z
   Z: 8N Npoly x M double array (real and complex need to be updated separate)
   N : stations
   M : clusters
   Npoly: no of basis functions
   z : right hand side 8NM Npoly x 1 (note the different ordering from Z)
   Bi : NpolyxNpoly matrix, Bi^T=Bi assumed
   returns 0; exits if the two work arrays cannot be allocated
 */
 int 
 update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi) { 
 /* For each of the M directions:
    gather from z, for every basis function, the four interleaved (stride 4)
    sequences of 2N values (real/imag of column 1, real/imag of column 2)
    into the 8N x Npoly work matrix R=[R_1^T I_1^T R_2^T I_2^T]^T,
    form Q = R Bi (Bi^T=Bi assumed),
    and scatter Q back into the block of Z belonging to this direction
    with the same stride 4 interleaving.
 */
 double *R,*Q;
 R=(double*)calloc((size_t)2*N*Npoly*4,sizeof(double));
 if (R==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
 }
 Q=(double*)calloc((size_t)2*N*Npoly*4,sizeof(double));
 if (Q==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
 }
 int dir,np,part;
 for (dir=0; dir<M; dir++) {
  /* gather: part 0..3 = R_1, I_1, R_2, I_2 */
  for (np=0; np<Npoly; np++) {
   int zoff=8*N*dir+np*8*N*M;
   for (part=0; part<4; part++) {
    my_dcopy(2*N, &z[zoff+part], 4, &R[np*8*N+part*2*N], 1);
   }
  }
  /* Q = 0 + R Bi */
  memset(Q,0,sizeof(double)*2*N*Npoly*4);
  my_dgemm('N','N',8*N,Npoly,Npoly,1.0,R,8*N,Bi,Npoly,1.0,Q,8*N);
  /* scatter back to Z */ 
  for (np=0; np<Npoly; np++) {
   int Zoff=8*N*Npoly*dir+8*N*np;
   for (part=0; part<4; part++) {
    my_dcopy(2*N, &Q[np*8*N+part*2*N], 1, &Z[Zoff+part], 4); 
   }
  }
 }
 free(R);
 free(Q);
 return 0;
 }
--- a/src/lib/Solvers/consensus_poly.o
+++ b/src/lib/Solvers/consensus_poly.o
--- a/src/lib/Solvers/diag_fl.cu
+++ b/src/lib/Solvers/diag_fl.cu
@ -1,270 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "cuda.h"
 #include <cuComplex.h>
 #include <stdio.h>
 /* enable this for checking for kernel failure */
 #define CUDA_DBG
 /* in place x[i] <= 1/sqrt(x[i]) if x[i]>eps, else 0, for i=0..M-1
    one thread per element, indexed along the x dimension only */
 __global__ void 
 kernel_sqrtdiv_fl(int M, float eps, float *__restrict__ x){
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  /* threads beyond the M elements have nothing to do */
  if (tid>=M) {
    return;
  }
  const float val=x[tid];
  x[tid]=(val>eps?1.0f/sqrtf(val):0.0f);
 }
 /* in place U[i] <= U[i]*D[i/M] for i=0..M*M-1, i.e. every run of M
    consecutive elements of U is scaled by one element of D
    one thread per element of U, indexed along the x dimension only */
 __global__ void 
 kernel_diagmult_fl(int M, float *__restrict__ U, const float *__restrict__ D) {
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  /* make sure to use only M*M threads */
  if (tid<M*M) {
     /* tid/M : which column this tid operates on */
     U[tid]*=D[tid/M];
  }
 }
 /* d[i] <= sum over ci of J[i*M+ci]^2, for i=0..N-1
    N: no. of outputs, M: no. of terms in each sum
    one thread per output, indexed along the x dimension only;
    threads with index >= N do nothing */
 __global__ void 
 kernel_jnorm_fl(int N, int M, const float *__restrict__ J, float *__restrict__ d) {
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  /* each thread handles one row */  
  if (tid<N) {
    /* accumulate in a local variable and store once, instead of
      reading and writing d[tid] in global memory on every iteration
      (the order of the additions is unchanged) */
    float acc=0.0f;
    for (int ci=0; ci<M; ci++) {
     /* J is transposed, so read each column */
     const float v=J[tid*M+ci];
     acc=acc+v*v;
    }
    d[tid]=acc;
  }
 }
 /* fill the Jacobian, one (baseline n, parameter m) pair per thread
    Nbase: no. of baselines (x dimension of the launch)
    M: no. of parameters, 8 per station (y dimension of the launch)
    jac: output, 8 values per baseline for each parameter, value k of baseline n
         and parameter m is stored at jac[m+M*(8*n+k)]
    coh: 8 values (4 complex) per baseline
    p: 8 values (4 complex) per station
    bb: 3 values per baseline, the first two are the station indices
    N: not used
    Only the entries where parameter m belongs to one of the two stations of
    baseline n are written, the others are left untouched. Flags are not
    taken into account. */
 __global__ void 
 kernel_jacf_fl2(int Nbase, int M, float *__restrict__ jac, const float *__restrict__ coh, const float *__restrict__ p, const short *__restrict__ bb, int N){
  /* global thread index : equal to the baseline */
  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
  /* which parameter:0...M */
  unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
  if (n>=Nbase || m>=M) {
    return;
  }
  const int sta1=(int)bb[3*n];
  const int sta2=(int)bb[3*n+1];
  /* station owning this parameter: 0...Ns-1 (because M=total par= 8 * Nstations */
  const int stc=m>>3;
  /* nothing to compute unless the parameter belongs to sta1 or sta2 */
  if (stc!=sta1 && stc!=sta2) {
    return;
  }
  /* coherency of this baseline as 2x2 complex */
  cuFloatComplex C[4];
  for (int k=0; k<4; k++) {
    C[k].x=coh[8*n+2*k];
    C[k].y=coh[8*n+2*k+1];
  }
  /* which parameter exactly 0..7 */
  const int stoff=(int)(m&7u);
  /* the station owning the parameter gets a unit vector (1 at stoff),
    the other station gets its current parameter values */
  float pp1[8]; 
  float pp2[8]; 
  if (stc==sta1) {
    for (int cn=0; cn<8; cn++) {
      pp1[cn]=(cn==stoff?1.0f:0.0f);
      pp2[cn]=p[sta2*8+cn];
    }
  } else {
    for (int cn=0; cn<8; cn++) {
      pp2[cn]=(cn==stoff?1.0f:0.0f);
      pp1[cn]=p[sta1*8+cn];
    }
  }
  cuFloatComplex G1[4];
  for (int k=0; k<4; k++) {
    G1[k].x=pp1[2*k];
    G1[k].y=pp1[2*k+1];
  }
  /* T1=G1*C */
  cuFloatComplex T1[4];
  T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
  T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
  T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
  T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
  /* G2 is the conjugate transpose: elements 1 and 2 swap places,
    imaginary parts change sign */
  cuFloatComplex G2[4];
  G2[0].x=pp2[0];
  G2[0].y=-pp2[1];
  G2[1].x=pp2[4];
  G2[1].y=-pp2[5];
  G2[2].x=pp2[2];
  G2[2].y=-pp2[3];
  G2[3].x=pp2[6];
  G2[3].y=-pp2[7];
  /* T2=T1*G2 */
  cuFloatComplex T2[4];
  T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
  T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
  T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
  T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
  /* update jacobian */
  /* NOTE: row major order */
  for (int k=0; k<4; k++) {
    jac[m+M*(8*n+2*k)]=T2[k].x;
    jac[m+M*(8*n+2*k+1)]=T2[k].y;
  }
 }
 /* only use extern if calling code is C */
 extern "C"
 {
 /* cuda driver for calculating jacf() */
 /* p: params (Mx1), jac: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
 /* jac is first set to zero (N*M floats), then kernel_jacf_fl2 is launched on a
   2D grid covering Nbase x M threads, and the call waits for the device to finish.
   If CUDA_DBG is defined, the program exits when a CUDA error is pending afterwards.
   Mclus is not used here. */
 void
 cudakernel_jacf_fl2(float *p, float *jac, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
 #ifdef CUDA_DBG
  cudaError_t error;
 #endif
  /* NOTE: use small value for ThreadsPerBlock here, like 8 */
  dim3 threadsPerBlock(16, 8);
  /* jacobian: Nbase x Nstations (proportional to N), so */
  dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x, 
               (M+threadsPerBlock.y-1)/threadsPerBlock.y);
  /* set memory of jac to zero */
  /* widen to size_t before multiplying: N*M evaluated as int can overflow */
  cudaMemset(jac, 0, (size_t)N*(size_t)M*sizeof(float));
 // printf("Kernel Jax data size=%d, params=%d, block=%d,%d, thread=%d,%d, baselines=%d\n",N, M, numBlocks.x,numBlocks.y, threadsPerBlock.x, threadsPerBlock.y, Nbase);
  kernel_jacf_fl2<<< numBlocks, threadsPerBlock>>>(Nbase,  M, jac, coh, p, bbh, Nstations);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
 #endif
 }
 /* invert sqrt(singular values)  1/Sd[]  for Sd[]> eps, others are set to 0 */
 /* launches kernel_sqrtdiv_fl with the given launch configuration on the M
   values of Sd and waits for the device to finish.
   If CUDA_DBG is defined, the program exits when a CUDA error is pending afterwards. */
 void
 cudakernel_sqrtdiv_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float eps, float *Sd) {
  kernel_sqrtdiv_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, eps, Sd);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  const cudaError_t status = cudaGetLastError();
  if (status != cudaSuccess) {
    /* report the CUDA error message and stop */
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
    exit(-1);
  }
 #endif
 }
 /* U <= U D, 
   U : MxM
   D : Mx1, diagonal matrix
   launches kernel_diagmult_fl with the given launch configuration and waits
   for the device to finish.
   If CUDA_DBG is defined, the program exits when a CUDA error is pending afterwards.
 */
 void
 cudakernel_diagmult_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float *U, float *D) {
  kernel_diagmult_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, U, D);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  const cudaError_t status = cudaGetLastError();
  if (status != cudaSuccess) {
    /* report the CUDA error message and stop */
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
    exit(-1);
  }
 #endif
 }
 /* diag(J^T J)
   d[i] = J[i,:] * J[i,:]
   J: NxM (in row major order, so J[i,:] is actually J[:,i])
   d: Nx1
   launches kernel_jnorm_fl with the given launch configuration and waits
   for the device to finish.
   If CUDA_DBG is defined, the program exits when a CUDA error is pending afterwards.
 */
 void
 cudakernel_jnorm_fl(int ThreadsPerBlock, int BlocksPerGrid, float *J, int N, int M, float *d) {
  kernel_jnorm_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N,M,J,d);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  const cudaError_t status = cudaGetLastError();
  if (status != cudaSuccess) {
    /* report the CUDA error message and stop */
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
    exit(-1);
  }
 #endif
 }
 }
--- a/src/lib/Solvers/diag_fl.o
+++ b/src/lib/Solvers/diag_fl.o
--- a/src/lib/Solvers/diagnostics.c
+++ b/src/lib/Solvers/diagnostics.c
@ -1,550 +0,0 @@
 /*
 *
 Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "Solvers.h"
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #include <cuda_runtime.h>
 #include <pthread.h>
 #include <math.h>
 /* Abort with a diagnostic when a CUDA runtime call reported an error.
    err: status returned by the call; file,line: location to report.
    Only active when compiled with CUDA_DEBUG, otherwise a no-op. */
 static void
 checkCudaError(cudaError_t err, const char *file, int line)
 {
 #ifdef CUDA_DEBUG
    if (err) {
        fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
        exit(EXIT_FAILURE);
    }
 #endif
 }
 /* Abort with a diagnostic when a cuBLAS call did not return
    CUBLAS_STATUS_SUCCESS.
    cbstatus: status returned by the call; file,line: location to report.
    Only active when compiled with CUDA_DEBUG, otherwise a no-op. */
 static void
 checkCublasError(cublasStatus_t cbstatus, char *file, int line)
 {
 #ifdef CUDA_DEBUG
   if (cbstatus==CUBLAS_STATUS_SUCCESS) {
    return;
   }
   fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
   exit(EXIT_FAILURE);
 #endif
 }
 /* find for one cluster J (J^T W J+ eW)^-1 J^T  and extract diagonal as output
  p: parameters M x 1 (host memory, copied to the device here)
  rd: residual vector N x 1 (on the device, invariant); not referenced in this routine
  x: (output, host memory) diagonal of leverage matrix, N values
  M,N: no. of parameters and no. of data points handled by this call
  cbhandle,solver_handle,gWORK: cuBLAS/cuSOLVER handles and device work storage
  tileoff: tile offset, needed for hybrid parameters
  ntiles: no. of tiles covered by this call
  dp: has all additional info: coherency,baselines,flags
  returns 0 (with CUDA_DEBUG, CUDA/cuBLAS failures terminate the program)
 */
 static int
 calculate_leverage(float *p, float *rd, float *x, int M, int N, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle, float *gWORK, int tileoff, int ntiles, me_data_t *dp) {
 /* p needs to be copied to device and x needs to be copied back from device
  rd always remains in the device (select part with the right offset)
  N will change in hybrid mode, so copy back to x with right offset */
 int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
 float *jacd,*xd,*jacTjacd,*pd,*cohd,*Ud,*VTd,*Sd;
 unsigned long int moff=0;
 short *bbd;
 cudaError_t err;
 /* carve the device work buffer gWORK into the arrays used below */
 /* total storage N+M*N+M*M+M+Nbase*8+M*M+M*M+M+M+Nbase*3(short)/(float) */
 xd=&gWORK[moff];
 moff+=N;
 jacd=&gWORK[moff];
 moff+=M*N;
 jacTjacd=&gWORK[moff];
 moff+=M*M;
 pd=&gWORK[moff];
 moff+=M;
 cohd=&gWORK[moff];
 moff+=Nbase*8;
 Ud=&gWORK[moff];
 moff+=M*M;
 VTd=&gWORK[moff];
 moff+=M*M;
 Sd=&gWORK[moff];
 moff+=M;
 bbd=(short*)&gWORK[moff];
 moff+=(Nbase*3*sizeof(short))/sizeof(float);
 err=cudaMemcpyAsync(pd, p, M*sizeof(float), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 /* need to give right offset for coherencies */
 /* offset: cluster offset+time offset */
 err=cudaMemcpyAsync(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 /* correct offset for baselines */
 err=cudaMemcpyAsync(bbd, &(dp->ddbase[3*(dp->Nbase)*(tileoff)]), Nbase*3*sizeof(short), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 cudaDeviceSynchronize();
 int ThreadsPerBlock=DEFAULT_TH_PER_BK;
 int ci,Mi;
 /* extra storage for cusolver (allocated per call, released before returning) */
 int work_size=0;
 int *devInfo;
 err=cudaMalloc((void**)&devInfo, sizeof(int));
 checkCudaError(err,__FILE__,__LINE__);
 float *work;
 float *rwork;
 cusolverDnSgesvd_bufferSize(solver_handle, M, M, &work_size);
 err=cudaMalloc((void**)&work, work_size*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**)&rwork, 5*M*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 /* set mem to 0 */
 err=cudaMemset(xd, 0, N*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 /* calculate J^T, not taking flags into account */
 cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
 /* calculate JTJ=(J^T J - [e] [W]) */
 //status=culaDeviceSgemm('N','T',M,M,N,1.0f,jacd,M,jacd,M,0.0f,jacTjacd,M);
 //checkStatus(status,__FILE__,__LINE__);
 cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
 float cone=1.0f; float czero=0.0f;
 cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_T,M,M,N,&cone,jacd,M,jacd,M,&czero,jacTjacd,M);
 /* check every cuBLAS call right away so a later success cannot mask a failure */
 checkCublasError(cbstatus,__FILE__,__LINE__);
 /* add mu * I to JTJ */
 cudakernel_diagmu_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, jacTjacd, 1e-9f);
 /* calculate inv(JTJ) using SVD */
 /* inv(JTJ) = Ud x Sid x VTd : we take into account that JTJ is symmetric */
 //status=culaDeviceSgesvd('A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M);
 //checkStatus(status,__FILE__,__LINE__);
 cusolverDnSgesvd(solver_handle,'A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M,work,work_size,rwork,devInfo);
 cudaDeviceSynchronize();
 /* find Sd= 1/sqrt(Sd) of the singular values (positive singular values) */
 cudakernel_sqrtdiv_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, 1e-9f, Sd);
 /* multiply Ud with Sid (diagonal) Ud <= Ud Sid (columns modified) */
 cudakernel_diagmult_fl(ThreadsPerBlock, (M*M+ThreadsPerBlock-1)/ThreadsPerBlock, M, Ud, Sd);
 /* now multiply Ud VTd to get the square root */
 //status=culaDeviceSgemm('N','N',M,M,M,1.0f,Ud,M,VTd,M,0.0f,jacTjacd,M);
 //checkStatus(status,__FILE__,__LINE__);
 cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_N,M,M,M,&cone,Ud,M,VTd,M,&czero,jacTjacd,M);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 /* calculate J^T, without taking flags into account (use same storage as previous J^T) */
 cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
 /* multiply (J^T)^T sqrt(B)  == sqrt(B)^T J^T, taking M columns at a time */
 for (ci=0; ci<(N+M-1)/M;ci++) {
  /* the last slab may hold fewer than M columns */
  if (ci*M+M<N) {
   Mi=M;
  } else {
   Mi=N-ci*M;
  }
  //status=culaDeviceSgemm('T','N',M,Mi,M,1.0f,jacTjacd,M,&jacd[ci*M*M],M,0.0f,VTd,M);
  //checkStatus(status,__FILE__,__LINE__);
  /* product goes to VTd (scratch) and is then copied back over the slab */
  cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_T,CUBLAS_OP_N,M,Mi,M,&cone,jacTjacd,M,&jacd[ci*M*M],M,&czero,VTd,M);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  err=cudaMemcpy(&jacd[ci*M*M],VTd,Mi*M*sizeof(float),cudaMemcpyDeviceToDevice);
  checkCudaError(err,__FILE__,__LINE__);
 }
 /* xd[i] <= ||J[i,:]||^2 */
 cudakernel_jnorm_fl(ThreadsPerBlock, (N+ThreadsPerBlock-1)/ThreadsPerBlock, jacd, N, M, xd);
 /* output x <=xd */
 err=cudaMemcpyAsync(x, xd, N*sizeof(float), cudaMemcpyDeviceToHost,0);
 cudaDeviceSynchronize();
 checkCudaError(err,__FILE__,__LINE__);
 /* release the per-call cuSOLVER buffers; without this every call leaks device memory */
 err=cudaFree(rwork);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(work);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(devInfo);
 checkCudaError(err,__FILE__,__LINE__);
 return 0;
 }
 /******************** pipeline functions **************************/
 /* Data shared between the master and the two slave threads of the
    diagnostics pipeline; arrays of size 2 are indexed by the slave's tid */
 typedef struct gb_data_dg_ {
  /* per-slave command, compared against the PT_DO_* values by the slave threads */
  int status[2]; 
  float *p[2]; /* pointer to parameters being used by each thread (depends on cluster) */
  float *xo; /* residual vector (copied to device) */
  float *x[2]; /* output leverage values from each thread */
  int M[2]; /* no. of parameters (per cluster,hybrid) */
  int N[2]; /* no. of visibilities (might change in hybrid mode) */
  me_data_t *lmdata[2]; /* one per thread */
  /* GPU related info */
  cublasHandle_t cbhandle[2]; /* CUBLAS handles */
  cusolverDnHandle_t solver_handle[2]; /* cuSOLVER handles */
  float *rd[2]; /* residual vector on the device (invariant) */
  float *gWORK[2]; /* GPU buffers */
  int64_t data_size; /* size of buffer (bytes) */
 } gbdatadg;
 /* slave thread 2GPU function */
 /* Thread body for one pipeline slave.
    data: slave_tdata with this thread's tid and the owning pipeline;
          pline->data is the shared gbdatadg.
    Each iteration waits at gate1, leaves the loop if pline->terminate is set,
    waits at gate2, then executes the command found in gd->status[tid]:
      PT_DO_CDERIV : run calculate_leverage() for every chunk of the current cluster
      PT_DO_AGPU   : attach the GPU and upload the residual vector
      PT_DO_DGPU   : free the device residual and detach the GPU
      PT_DO_NOTHING: idle
    any other value is fatal. Always returns NULL. */
 static void *
 pipeline_slave_code_dg(void *data)
 {
 slave_tdata *td=(slave_tdata*)data;
 gbdatadg *gd=(gbdatadg*)(td->pline->data);
 int tid=td->tid;
 while(1) {
  sync_barrier(&(td->pline->gate1)); /* stop at gate 1*/
  if(td->pline->terminate) break; /* if flag is set, break loop */
  sync_barrier(&(td->pline->gate2)); /* stop at gate 2 */
 /* do work */
  if (gd->status[tid]==PT_DO_CDERIV) {
    me_data_t *t=(me_data_t *)gd->lmdata[tid];
    /* divide the tiles into chunks tilesz/nchunk */
    int tilechunk=(t->tilesz+t->carr[t->clus].nchunk-1)/t->carr[t->clus].nchunk;
    int ci;
    int cj=0; /* first tile of the current chunk */
    int ntiles;
    /* loop over chunk, right set of parameters and residual vector */
    for (ci=0; ci<t->carr[t->clus].nchunk; ci++) {
     /* divide the tiles into chunks tilesz/nchunk */
     /* NOTE(review): if nchunk*tilechunk exceeds tilesz by more than one chunk,
        cj can pass tilesz and ntiles becomes negative - confirm nchunk is
        constrained elsewhere */
     if (cj+tilechunk<t->tilesz) {
      ntiles=tilechunk;
     } else {
      ntiles=t->tilesz-cj;
     }
    /* right offset for rd[] and x[] needed and since no overlap,
       can wait for all chunks to complete  */
    /* chunk ci uses the ci-th block of M parameters and 8*ntiles*Nbase data points */
    calculate_leverage(&gd->p[tid][ci*(gd->M[tid])],&gd->rd[tid][8*cj*t->Nbase],&gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->cbhandle[tid], gd->solver_handle[tid], gd->gWORK[tid], cj, ntiles, gd->lmdata[tid]);
    cj=cj+tilechunk;
   }
  } else if (gd->status[tid]==PT_DO_AGPU) {
    attach_gpu_to_thread2(tid,&gd->cbhandle[tid],&gd->solver_handle[tid],&gd->gWORK[tid],gd->data_size,1);
    /* copy residual vector to device */
    cudaError_t err;
    me_data_t *t=(me_data_t *)gd->lmdata[tid];
    err=cudaMalloc((void**)&gd->rd[tid], (size_t)8*t->tilesz*t->Nbase*sizeof(float));
    checkCudaError(err,__FILE__,__LINE__);
    err=cudaMemcpy(gd->rd[tid], gd->xo, 8*t->tilesz*t->Nbase*sizeof(float), cudaMemcpyHostToDevice);
    checkCudaError(err,__FILE__,__LINE__);
  } else if (gd->status[tid]==PT_DO_DGPU) {
    /* release what PT_DO_AGPU set up */
    cudaFree(gd->rd[tid]);
    detach_gpu_from_thread2(gd->cbhandle[tid],gd->solver_handle[tid],gd->gWORK[tid],1);
  } else if (gd->status[tid]!=PT_DO_NOTHING) { /* catch error */ 
    fprintf(stderr,"%s: %d: invalid mode for slave tid=%d status=%d\n",__FILE__,__LINE__,tid,gd->status[tid]);
    exit(1);
  }
 }
 return NULL;
 }
 /* Set up the two-slave pipeline and start both slave threads.
    pline: pipeline to initialize (attributes, both gates, terminate flag,
           task history and per-slave bookkeeping are all (re)created here)
    data:  shared data handed to the slaves through pline->data
    Exits the program if any allocation fails. */
 static void
 init_pipeline_dg(th_pipeline *pline,
     void *data)
 {
 slave_tdata *first,*second;

 /* slaves are joinable so the pipeline can wait for them on teardown */
 pthread_attr_init(&(pline->attr));
 pthread_attr_setdetachstate(&(pline->attr),PTHREAD_CREATE_JOINABLE);

 /* both gates are shared by 3 threads: the master and the two slaves */
 init_th_barrier(&(pline->gate1),3);
 init_th_barrier(&(pline->gate2),3);

 pline->terminate=0;
 pline->data=data;

 first=(slave_tdata*)malloc(sizeof(slave_tdata));
 if (!first) {
    fprintf(stderr,"no free memory\n");
    exit(1);
 }
 second=(slave_tdata*)malloc(sizeof(slave_tdata));
 if (!second) {
    fprintf(stderr,"no free memory\n");
    exit(1);
 }
 pline->thst=(taskhist*)malloc(sizeof(taskhist));
 if (!pline->thst) {
    fprintf(stderr,"no free memory\n");
    exit(1);
 }
 init_task_hist(pline->thst);

 /* each slave knows its id and its pipeline */
 first->pline=pline;
 second->pline=pline;
 first->tid=0;
 second->tid=1;

 /* keep the bookkeeping reachable from the pipeline so it can be freed later */
 pline->sd0=first;
 pline->sd1=second;

 pthread_create(&(pline->slave0),&(pline->attr),pipeline_slave_code_dg,(void*)first);
 pthread_create(&(pline->slave1),&(pline->attr),pipeline_slave_code_dg,(void*)second);
 }
 /* destroy the pipeline */
 /* need to kill the slaves first */
 /* pline: pipeline previously set up by init_pipeline_dg.
    The slaves test pline->terminate right after gate1, so the flag is set
    first and the master then passes gate1 to release them. pline itself
    and the shared data it pointed to are not freed here. */
 static void
 destroy_pipeline_dg(th_pipeline *pline)
 {
 /* ask both slaves to leave their loops, then let them through gate 1 */
 pline->terminate=1;
 sync_barrier(&(pline->gate1));
 /* wait until both slave threads have exited */
 pthread_join(pline->slave0,NULL);
 pthread_join(pline->slave1,NULL);
 /* release synchronisation objects and bookkeeping */
 destroy_th_barrier(&(pline->gate1));
 destroy_th_barrier(&(pline->gate2));
 pthread_attr_destroy(&(pline->attr));
 destroy_task_hist(pline->thst);
 free(pline->thst);
 free(pline->sd0);
 free(pline->sd1);
 /* only the reference is dropped; the data is owned by the caller */
 pline->data=NULL;
 }
 /******************** end pipeline functions **************************/
 /*  Calculate St.Laurent-Cook Jacobian leverage
  u,v,w: stored in the per-thread model data
  p: parameters, Mt*8*N values (converted to float for the GPU)
  xo: residual, Nbase*tilesz*8 values (modified when diagmode==1)
  N: 8*N is the no. of parameters per cluster (per chunk)
  Nbase,tilesz: baselines per tile and no. of tiles
  barr,carr: baseline and cluster info
  flags: 2 for flags based on uvcut, 1 for normal flags
  coh: coherencies are calculated for all baselines, regardless of flag
  M: no. of clusters, processed two at a time (one per slave thread)
  Mt: used to size the parameter array (Mt*8*N)
  diagmode: 1: replace residual, 2: calc noise/leverage ratio
  Nt: no. of threads passed to the CPU helper routines
  returns 0; allocation failures terminate the program
 */
 int
 calculate_diagnostics(double *u,double *v,double *w,double *p,double *xo,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, complex double *coh, int M,int Mt,int diagmode, int Nt) {
  int cj;
  int n;
  me_data_t lmdata0,lmdata1;
  int Nbase1;
  /* no of data */
  n=Nbase*tilesz*8;
  /* true no of baselines */
  Nbase1=Nbase*tilesz;
  double *ddcoh;
  short *ddbase;
  int c0,c1;
  float *ddcohf, *pf, *xdummy0f, *xdummy1f, *res0, *dgf;
 /********* thread data ******************/
  /* barrier */
  th_pipeline tp;
  gbdatadg tpg;
 /****************************************/
  lmdata0.clus=lmdata1.clus=-1;
  /* setup data for lmfit */
  lmdata0.u=lmdata1.u=u;
  lmdata0.v=lmdata1.v=v;
  lmdata0.w=lmdata1.w=w;
  lmdata0.Nbase=lmdata1.Nbase=Nbase;
  lmdata0.tilesz=lmdata1.tilesz=tilesz;
  lmdata0.N=lmdata1.N=N;
  lmdata0.barr=lmdata1.barr=barr;
  lmdata0.carr=lmdata1.carr=carr;
  lmdata0.M=lmdata1.M=M;
  lmdata0.Mt=lmdata1.Mt=Mt;
  lmdata0.freq0=lmdata1.freq0=NULL; /* not used */
  lmdata0.Nt=lmdata1.Nt=Nt;
  lmdata0.coh=lmdata1.coh=coh;
  /* rearrange coh for GPU use */
  if ((ddcoh=(double*)calloc((size_t)(M*Nbase1*8),sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  if ((ddcohf=(float*)calloc((size_t)(M*Nbase1*8),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  if ((ddbase=(short*)calloc((size_t)(Nbase1*3),sizeof(short)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  rearrange_coherencies2(Nbase1, barr, coh, ddcoh, ddbase, M, Nt);
  lmdata0.ddcoh=lmdata1.ddcoh=ddcoh;
  lmdata0.ddbase=lmdata1.ddbase=ddbase;
  /* ddcohf (float) << ddcoh (double) */
  double_to_float(ddcohf,ddcoh,M*Nbase1*8,Nt);
  lmdata0.ddcohf=lmdata1.ddcohf=ddcohf;
  if ((pf=(float*)calloc((size_t)(Mt*8*N),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  double_to_float(pf,p,Mt*8*N,Nt);
  /* residual */
  if ((res0=(float*)calloc((size_t)(n),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  double_to_float(res0,xo,n,Nt);
  /* sum of diagonal values of leverage */
  if ((dgf=(float*)calloc((size_t)(n),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  /* per-thread output buffers for the leverage of one cluster */
  if ((xdummy0f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  if ((xdummy1f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
 /********** setup threads *******************************/
  /* also calculate the total storage needed to be allocated on a GPU */
  /* determine total size for memory allocation
     residual = n (separately allocated)
     diagonal = n
    For one cluster,
     Jacobian = nxm,  J^T J = mxm, (also inverse)
  */
  int Mm=8*N; /* no of parameters */
  int64_t data_sz=0;
  data_sz=(int64_t)(n+Mm*n+3*Mm*Mm+3*Mm+Nbase1*8)*sizeof(float)+(int64_t)Nbase1*3*sizeof(short);
  tpg.data_size=data_sz;
  tpg.lmdata[0]=&lmdata0;
  tpg.lmdata[1]=&lmdata1;
  tpg.xo=res0; /* residual */
  init_pipeline_dg(&tp,&tpg);
  /* every command is a gate1/gate2 pair: the master fills in status[] between
     the gates, and a second pair with PT_DO_NOTHING waits for completion */
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
  tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
  sync_barrier(&(tp.gate2)); /* sync at gate 2*/
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
  tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2*/
 /********** done setup threads *******************************/
  tpg.x[0]=xdummy0f;
  tpg.M[0]=8*N; /* even though size of p is > M, dont change this */
  tpg.N[0]=n; /* Nbase*tilesz*8 */
  tpg.x[1]=xdummy1f;
  tpg.M[1]=8*N; /* even though size of p is > M, dont change this */
  tpg.N[1]=n; /* Nbase*tilesz*8 */
  for (cj=0; cj<M/2; cj++) { /* iter per cluster pairs */
   c0=2*cj;
   c1=2*cj+1;
   sync_barrier(&(tp.gate1)); /* sync at gate 1 */
   lmdata0.clus=c0;
   lmdata1.clus=c1;
   /* run this from a separate thread */
   tpg.p[0]=&pf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
   tpg.p[1]=&pf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
   tpg.status[0]=tpg.status[1]=PT_DO_CDERIV;
   sync_barrier(&(tp.gate2)); /* sync at gate 2 */
   sync_barrier(&(tp.gate1)); /* sync at gate 1 */
   tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
   sync_barrier(&(tp.gate2)); /* sync at gate 2 */
   /* add result to the sum */
   my_saxpy(n, xdummy0f, 1.0f, dgf);
   my_saxpy(n, xdummy1f, 1.0f, dgf);
  }
  /* odd cluster out, if M is odd */
  if (M%2) {
   c0=M-1;
   sync_barrier(&(tp.gate1)); /* sync at gate 1 */
   tpg.p[0]=&pf[carr[c0].p[0]];
   lmdata0.clus=c0;
   tpg.status[0]=PT_DO_CDERIV;
   tpg.status[1]=PT_DO_NOTHING;
   sync_barrier(&(tp.gate2)); /* sync at gate 2 */
 /**************************************************************************/
   sync_barrier(&(tp.gate1)); /* sync at gate 1 */
   tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
   sync_barrier(&(tp.gate2)); /* sync at gate 2 */
   my_saxpy(n, xdummy0f, 1.0f, dgf);
  }
  free(pf);
  free(ddcohf);
  free(xdummy1f);
  /* res0 is NOT freed here: it is still read by find_sumproduct() below when
     diagmode!=1 (freeing it at this point was a use-after-free) */
  free(ddcoh);
  /******** free threads ***************/
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
  tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
  sync_barrier(&(tp.gate2)); /* sync at gate 2*/
  destroy_pipeline_dg(&tp);
  /******** done free threads ***************/
  /* now add 1's to locations with flagged data */
  /* create array for adding */
  create_onezerovec(Nbase1, ddbase, xdummy0f, Nt);
  my_saxpy(n, xdummy0f, 1.0f, dgf);
  free(xdummy0f);
  free(ddbase);
  /* output */
 //  for (cj=0; cj<n; cj++) {
 //   printf("%d %f\n",cj,dgf[cj]);
 //  }
  if (diagmode==1) {
   /* copy back to output */
   float_to_double(xo,dgf,n,Nt);
  } else {
   /* solve system of  equations a * leverage + b * 1 = |residual|
      to find a,b scalars, and just print them as output */
   /* find  1^T |r| = sum (|residual|) and  lev^T |r|  */
   float sum1,sum2;
   find_sumproduct(n, res0, dgf, &sum1, &sum2, Nt);
   //printf("sum|res|=%f sum(lev^T |res|)=%f\n",sum1,sum2);
   float a00,a01,a11;
   a00=my_fnrm2(n,dgf); /* lev^T lev */
   a01=my_fasum(n,dgf); /* = a10 = sum|leverage| */
   a00=a00*a00;
   a11=(float)n; /* sum( 1 ) */
   /* NOTE(review): with a00=lev^T lev in the first row, the matching right-hand
      side would be lev^T|r| (sum2), yet r00 takes sum1 - confirm the intended
      pairing against find_sumproduct() before relying on alpha/beta */
   float r00,r01;
   r00=sum1;
   r01=sum2;
   //printf("A=[\n %f %f;\n %f %f];\n b=[\n %f\n %f\n]\n",a00,a01,a01,a11,r00,r01);
   /* solve A [a b]^T = r */
   float alpha,beta,denom;
   denom=(a00*a11-a01*a01);
   //printf("denom=%f\n",denom);
   if (denom>1e-6f) { /* can be solved */
    alpha=(r00*a11-r01*a01)/denom;
   } else {
    alpha=0.0f;
   }
   beta=(r00-a00*alpha)/a01; 
   printf("Error Noise/Model %e/%e\n",beta,alpha);
  }
  /* the float residual copy is no longer needed */
  free(res0);
  free(dgf);
 return 0;
 }
--- a/src/lib/Solvers/diagnostics.o
+++ b/src/lib/Solvers/diagnostics.o
--- a/src/lib/Solvers/lbfgs.c
+++ b/src/lib/Solvers/lbfgs.c
--- a/src/lib/Solvers/lbfgs.o
+++ b/src/lib/Solvers/lbfgs.o
--- a/src/lib/Solvers/lbfgs_nocuda.c
+++ b/src/lib/Solvers/lbfgs_nocuda.c
@ -1,926 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "Solvers.h"
 #include <pthread.h>
 /**** repeated code here ********************/
 /* Jones matrix multiplication
   C=A*B
   a,b,c: 2x2 complex matrices stored as 4 values each, row by row;
   c must not overlap a or b
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void
 amb(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
 int row,col;
 for (row=0; row<2; row++) {
  for (col=0; col<2; col++) {
   /* c(row,col) = a(row,0)*b(0,col) + a(row,1)*b(1,col) */
   c[2*row+col]=a[2*row]*b[col]+a[2*row+1]*b[2+col];
  }
 }
 }
 /* Jones matrix multiplication
   C=A*B^H
   a,b,c: 2x2 complex matrices stored as 4 values each, row by row;
   c must not overlap a or b
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void
 ambt(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
 int row,col;
 for (row=0; row<2; row++) {
  for (col=0; col<2; col++) {
   /* c(row,col) = a(row,0)*conj(b(col,0)) + a(row,1)*conj(b(col,1)) */
   c[2*row+col]=a[2*row]*conj(b[2*col])+a[2*row+1]*conj(b[2*col+1]);
  }
 }
 }
 /**** end repeated code ********************/
 /* worker thread for a cpu */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void *
 cpu_calc_deriv(void *adata) {
 thread_data_grad_t *t=(thread_data_grad_t*)adata;
 int ci,nb;
 int stc,stoff,stm,sta1,sta2;
 int N=t->N; /* stations */
 int M=t->M; /* clusters */
 int Nbase=(t->Nbase)*(t->tilesz);
 complex double xr[4]; /* residuals */
 complex double G1[4],G2[4],C[4],T1[4],T2[4];
 double pp[8];
 complex double csum;
 int cli,tpchunk,pstart,nchunk,tilesperchunk,stci,ttile,tptile,poff;
 /* iterate over each paramter */
 for (ci=t->g_start; ci<=t->g_end; ++ci) {
    t->g[ci]=0.0;
    /* find station and parameter corresponding to this value of ci */
    /* this parameter should correspond to the right baseline (x tilesz)
        to contribute to the derivative */
    cli=0;
    while((cli<M) && (ci<t->carr[cli].p[0] || ci>t->carr[cli].p[0]+8*N*t->carr[cli].nchunk-1)) {
     cli++;
    }
   /* now either cli>=M: cluster not found 
       or cli<M and cli is the right cluster */
   if (cli==M && ci>=t->carr[cli-1].p[0] && ci<=t->carr[cli-1].p[0]+8*N*t->carr[cli-1].nchunk-1) {
    cli--;
   }
   if (cli<M) {
    /* right parameter offset */
    stci=ci-t->carr[cli].p[0];
    stc=(stci%(8*N))/8; /* 0..N-1 */
    /* make sure this baseline contribute to this parameter */
    tpchunk=stci/(8*N);
    nchunk=t->carr[cli].nchunk;
    pstart=t->carr[cli].p[0];
    tilesperchunk=(t->tilesz+nchunk-1)/nchunk;
    /* iterate over all baselines and accumulate sum */
    for (nb=0; nb<Nbase; ++nb) {
     /* which tile is this ? */
     ttile=nb/t->Nbase;
     /* which chunk this tile belongs to */
     tptile=ttile/tilesperchunk;
     /* now tptile has to match tpchunk, otherwise ignore calculation */
     if (tptile==tpchunk) {
     sta1=t->barr[nb].sta1;
     sta2=t->barr[nb].sta2;
     if (((stc==sta1)||(stc==sta2))&& !t->barr[nb].flag) {
      /* this baseline has a contribution */
      /* which paramter of this station */
      stoff=(stci%(8*N))%8; /* 0..7 */
      /* which cluster */
      stm=cli; /* 0..M-1 */
      /* exact expression for derivative 
         2 real( vec^H(residual_this_baseline) 
            * vec(-J_{pm}C_{pqm} J_{qm}^H)
        where m: chosen cluster
        J_{pm},J_{qm} Jones matrices for baseline p-q
        depending on the parameter, J ==> E 
        E: zero matrix, except 1 at location of m
       residual : in x[8*nb:8*nb+7]
       C coh: in coh[8*M*nb+m*8:8*M*nb+m*8+7] (double storage)
           coh[4*M*nb+4*m:4*M*nb+4*m+3] (complex storage)
       J_p,J_q: in p[sta1*8+m*8*N: sta1*8+m*8*N+7]
        and p[sta2*8+m*8*N: sta2*8+m*8*N+ 7]
     */
     /* read in residual vector, conjugated */
     xr[0]=(t->x[nb*8])-_Complex_I*(t->x[nb*8+1]);
     xr[1]=(t->x[nb*8+2])-_Complex_I*(t->x[nb*8+3]);
     xr[2]=(t->x[nb*8+4])-_Complex_I*(t->x[nb*8+5]);
     xr[3]=(t->x[nb*8+6])-_Complex_I*(t->x[nb*8+7]);
     /* read in coherency */
     C[0]=t->coh[4*M*nb+4*stm];
     C[1]=t->coh[4*M*nb+4*stm+1];
     C[2]=t->coh[4*M*nb+4*stm+2];
     C[3]=t->coh[4*M*nb+4*stm+3];
     memset(pp,0,sizeof(double)*8); 
     if (stc==sta1) {
       /* this station parameter gradient */
       pp[stoff]=1.0;
       memset(G1,0,sizeof(complex double)*4); 
       G1[0]=pp[0]+_Complex_I*pp[1];
       G1[1]=pp[2]+_Complex_I*pp[3];
       G1[2]=pp[4]+_Complex_I*pp[5];
       G1[3]=pp[6]+_Complex_I*pp[7];
       poff=pstart+tpchunk*8*N+sta2*8;
       G2[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
       G2[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
       G2[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+4]);
       G2[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
     } else if (stc==sta2) {
       memset(G2,0,sizeof(complex double)*4); 
       pp[stoff]=1.0;
       G2[0]=pp[0]+_Complex_I*pp[1];
       G2[1]=pp[2]+_Complex_I*pp[3];
       G2[2]=pp[4]+_Complex_I*pp[5];
       G2[3]=pp[6]+_Complex_I*pp[7];
       poff=pstart+tpchunk*8*N+sta1*8;
       G1[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
       G1[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
       G1[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+5]);
       G1[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
     }
     /* T1=G1*C */
     amb(G1,C,T1);
     /* T2=T1*G2' */
     ambt(T1,G2,T2);
     /* calculate product xr*vec(J_p C J_q^H ) */
     csum=xr[0]*T2[0];
     csum+=xr[1]*T2[1];
     csum+=xr[2]*T2[2];
     csum+=xr[3]*T2[3];
     /* accumulate sum */
     t->g[ci]+=-2.0*creal(csum);
     }
     }
    }
   }
 }
 return NULL;
 }
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static int
 func_grad(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *p, double *g, double *xo, int m, int n, double step, void *adata) {
  /* gradient for each parameter is
     (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step
    i=0,...,m-1 for all parameters
    e_i: unit vector, 1 only at i-th location
  */
  double *x; /* array to store residual */
  int ci;
  me_data_t *dp=(me_data_t*)adata;
  int Nt=dp->Nt;
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_grad_t *threaddata;
  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  /* evaluate func once, store in x, and create threads */
  /* and calculate the residual x=xo-func */
  func(p,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
 #ifndef USE_MIC
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
   exit(1);
  }
  if ((threaddata=(thread_data_grad_t*)malloc((size_t)Nt*sizeof(thread_data_grad_t)))==0) {
 #ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
    exit(1);
  }
  int nth,nth1,Nparm;
  /* parameters per thread */
  Nparm=(m+Nt-1)/Nt;
  /* each thread will calculate derivative of part of 
     parameters */
  ci=0;
  for (nth=0;  nth<Nt; nth++) {
   threaddata[nth].Nbase=dp->Nbase;
   threaddata[nth].tilesz=dp->tilesz;
   threaddata[nth].barr=dp->barr;
   threaddata[nth].carr=dp->carr;
   threaddata[nth].M=dp->M;
   threaddata[nth].N=dp->N;
   threaddata[nth].coh=dp->coh;
   threaddata[nth].m=m;
   threaddata[nth].n=n;
   threaddata[nth].x=x;
   threaddata[nth].p=p;
   threaddata[nth].g=g;
   threaddata[nth].g_start=ci;
   threaddata[nth].g_end=ci+Nparm-1;
   if (threaddata[nth].g_end>=m) {
    threaddata[nth].g_end=m-1;
   }
   ci=ci+Nparm;
   pthread_create(&th_array[nth],&attr,cpu_calc_deriv,(void*)(&threaddata[nth]));
  }
  /* now wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
  free(x);
  return 0;
 }
 /* use algorithm 9.1 to compute pk=Hk gk */
 /* m: no. of parameters
   pk: (output) size m x 1
   gk: gradient, size m x 1
   s, y: size mM x 1 (M stored vectors of length m each)
   rho: size M x 1 
   M: no. of stored (s,y) pairs; with M==0 the result is pk=gk
   ii: true location of the k th values in s,y */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void
 mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, int M, int ii) {
 int ci;
 double *alphai;
 int *idx; /* store sorted locations of s, y here */
 double gamma,beta;
 /* calloc() may legitimately return NULL for a zero-sized request, so only
    treat NULL as out-of-memory when M>0; with M==0 neither array is touched */
 if ((alphai=(double*)calloc((size_t)M,sizeof(double)))==0 && M>0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
 }
 /* idx holds ints: size the elements accordingly (was sizeof(double)) */
 if ((idx=(int*)calloc((size_t)M,sizeof(int)))==0 && M>0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
 }
 if (M>0) {
  /* find the location of k-1 th value */
  if (ii>0) {
   ii=ii-1;
  } else {
   ii=M-1;
  }
 /* s,y will have 0,1,...,ii,ii+1,...M-1 */
 /* map this to  ii+1,ii+2,...,M-1,0,1,..,ii */
  for (ci=0; ci<M-ii-1; ci++){
   idx[ci]=(ii+ci+1);
  }
  for(ci=M-ii-1; ci<M; ci++) {
   idx[ci]=(ci-M+ii+1);
  }
 }
 #ifdef DEBUG
 printf("prod M=%d, current ii=%d\n",M,ii);
 for(ci=0; ci<M; ci++) {
  printf("%d->%d ",ci,idx[ci]);
 }
 printf("\n");
 #endif
 /* q = grad(f)k : pk<=gk */
 my_dcopy(m,gk,1,pk,1);
 /* this should be done in the right order */
 /* first loop walks idx[] from its last entry down to its first */
 for (ci=0; ci<M; ci++) {
  /* alphai=rhoi si^T*q */
  alphai[M-ci-1]=rho[idx[M-ci-1]]*my_ddot(m,&s[m*idx[M-ci-1]],pk);
  /* q=q-alphai yi */
  my_daxpy(m,&y[m*idx[M-ci-1]],-alphai[M-ci-1],pk);
 }
 /* r=Hk(0) q : initial hessian */
 /* gamma=s(k-1)^T*y(k-1)/y(k-1)^T*y(k-1)*/
 gamma=1.0;
 if (M>0) {
  gamma=my_ddot(m,&s[m*idx[M-1]],&y[m*idx[M-1]]);
  gamma/=my_ddot(m,&y[m*idx[M-1]],&y[m*idx[M-1]]);
  /* Hk(0)=gamma I, so scale q by gamma */
  /* r= Hk(0) q */
  my_dscal(m,gamma,pk);
 } 
 /* second loop walks idx[] from its first entry up to its last */
 for (ci=0; ci<M; ci++) {
  /* beta=rhoi yi^T * r */
  beta=rho[idx[ci]]*my_ddot(m,&y[m*idx[ci]],pk);
  /* r = r + (alphai-beta)*si */
  my_daxpy(m,&s[m*idx[ci]],alphai[ci]-beta,pk);
 }
 free(alphai);
 free(idx);
 }
 /* cubic interpolation in interval [a,b] (a>b is possible)
   to find step that minimizes cost function */
 /* func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   a/b:  interval for interpolation
   x: size n x 1 (storage)
   xp: size m x 1 (storage)
   xo: observed data size n x 1
   m: no. of parameters
   n: size of vector function
   step: step size for differencing 
   adata:  additional data passed to the function
   returns: a, b, or the interpolated root z0, whichever has the lowest
    cost among the values evaluated below
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static double 
 cubic_interp(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double a, double b, double *x, double *xp,  double *xo, int m, int n, double step, void *adata) {
  double f0,f1,f0d,f1d; /* function values and derivatives at a,b */
  double p01,p02,z0,fz0;
  double aa,cc;
  /* f0: squared norm of the residual at xk+a*pk */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  f0=my_dnrm2(n,x);
  f0*=f0;
  /* grad(phi_0): evaluate at -step and +step */
  my_daxpy(m,pk,step,xp); /* xp is now xk+(a+step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp is now xk+(a-step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  /* central difference of the squared norms */
  f0d=(p01*p01-p02*p02)/(2.0*step);
  /* f1: squared norm of the residual at xk+b*pk */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  f1=my_dnrm2(n,x);
  f1*=f1;
  /* grad(phi_1): evaluate at -step and +step */
  my_daxpy(m,pk,step,xp); /* xp is now xk+(b+step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp is now xk+(b-step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  f1d=(p01*p01-p02*p02)/(2.0*step);
  //printf("Interp a,f(a),f'(a): (%lf,%lf,%lf) (%lf,%lf,%lf)\n",a,f0,f0d,b,f1,f1d);
  /* cubic poly in [0,1] is f0+f0d z+eta z^2+xi z^3 
    where eta=3(f1-f0)-2f0d-f1d, xi=f0d+f1d-2(f1-f0) 
    derivative f0d+2 eta z+3 xi z^2 => cc+bb z+aa z^2 */
   aa=3.0*(f0-f1)/(b-a)+(f1d-f0d);
   /* p01 is reused here as the discriminant */
   p01=aa*aa-f0d*f1d;
  /* root exist? */
  if (p01>0.0) {
   /* root */
   cc=sqrt(p01);
   z0=b-(f1d+cc-aa)*(b-a)/(f1d-f0d+2.0*cc);
   /* FIXME: check if this is within boundary */
   /* aa,cc are reused as the upper and lower end of the interval */
   aa=MAX(a,b);
   cc=MIN(a,b);
   //printf("Root=%lf, in [%lf,%lf]\n",z0,cc,aa);
   if (z0>aa || z0<cc) {
    /* root outside the interval: give it a cost that cannot win below
       (as long as f0 and f1 are positive) */
    fz0=f0+f1;
   } else {
    /* evaluate function for this root */
    /* NOTE(review): z0 is bounds-checked against [a,b] and returned as a step
       length, but the cost is evaluated at a+z0*(b-a), i.e. as if z0 were a
       fraction of the interval - confirm which is intended */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(z0)*pk */
    func(xp,x,m,n,adata);
    my_daxpy(n,xo,-1.0,x);
    fz0=my_dnrm2(n,x);
    fz0*=fz0;
   }
   /* now choose between f0,f1,fz0,fz1 */
   if (f0<f1 && f0<fz0) {
     return a;
   }
   if (f1<fz0) {
     return b;
   }
   /* else */
   return (z0);
  } else { 
   /* find the value from a or b that minimizes func */
   if (f0<f1) {
    return a;
   } else {
    return b;
   }
  }
  /* not reached: every branch above returns */
  return 0;
 }
 /*************** Fletcher line search **********************************/
 /* zoom function for line search: starting from the bracket [a,b], repeatedly
   interpolate a trial step and shrink the bracket (at most 10 iterations),
   then return the selected step length */
 /* func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   a/b: bracket interval [a,b] (a>b) is possible
   x: size n x 1 (storage, overwritten)
   xp: size m x 1 (storage, overwritten)
   phi_0: phi(0)
   gphi_0: grad(phi(0))
   sigma,rho,t1,t2,t3: line search parameters (t1 is not used in this function)
   xo: observed data size n x 1
   m: size of parameter vectors xk,pk
   n: size of vector function
   step: step size for differencing 
   adata:  additional data passed to the function
   Here phi(alpha)=||func(xk+alpha*pk)-xo||^2.
   Returns the selected step length.
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static double 
 linesearch_zoom(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double a, double b, double *x, double *xp,  double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
  double alphaj,phi_j,phi_aj;
  double gphi_j,p01,p02,aj,bj;
  double alphak=1.0;
  int ci,found_step=0;
  /* current bracket [aj,bj] */
  aj=a;
  bj=b;
  ci=0;
  /* at most 10 bracket refinements */
  while(ci<10) {
    /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */
    p01=aj+t2*(bj-aj);
    p02=bj-t3*(bj-aj);
    alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
    //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj);
    /* evaluate phi(alphaj) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */
    func(xp,x,m,n,adata);
    /* calculate x<=x-xo */
    my_daxpy(n,xo,-1.0,x);
    phi_j=my_dnrm2(n,x);
    phi_j*=phi_j; /* squared norm */
    /* evaluate phi(aj) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */
    func(xp,x,m,n,adata);
    /* calculate x<=x-xo */
    my_daxpy(n,xo,-1.0,x);
    phi_aj=my_dnrm2(n,x);
    phi_aj*=phi_aj; /* squared norm */
    /* not enough decrease relative to phi(0), or no improvement over
       phi(aj): move the upper end of the bracket to alphaj */
    if ((phi_j>phi_0+rho*alphaj*gphi_0) || phi_j>=phi_aj) {
      bj=alphaj; /* aj unchanged */
    } else {
     /* evaluate grad(alphaj) by central differences at alphaj+step and alphaj-step */
     my_dcopy(m,xk,1,xp,1); /* xp<=xk */
     my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
     func(xp,x,m,n,adata);
     /* calculate x<=x-xo */
     my_daxpy(n,xo,-1.0,x);
     p01=my_dnrm2(n,x);
     my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */
     func(xp,x,m,n,adata);
     /* calculate x<=x-xo */
     my_daxpy(n,xo,-1.0,x);
     p02=my_dnrm2(n,x);
     /* p01,p02 are norms (not squared), hence the squares here */
     gphi_j=(p01*p01-p02*p02)/(2.0*step);
     /* termination due to roundoff/other errors pp. 38, Fletcher */
     if ((aj-alphaj)*gphi_j<=step) {
      alphak=alphaj;
      found_step=1;
      break;
     }
     /* gradient magnitude small enough compared to grad(phi(0)): accept alphaj */
     if (fabs(gphi_j)<=-sigma*gphi_0) {
      alphak=alphaj;
      found_step=1;
      break;
     }
     if (gphi_j*(bj-aj)>=0) {
       bj=aj;
     } /* else bj unchanged */
     aj=alphaj;
   }
   ci++;
  }
  if (!found_step) {
   /* no step was accepted within the iteration limit:
      fall back to the last interpolated value */
   alphak=alphaj;
  }
 #ifdef DEBUG
  printf("Found %g Interval [%lf,%lf]\n",alphak,a,b);
 #endif
  return alphak;
 }
 /* line search: find a step length alphak along pk, starting from alpha1,
   using at most 9 bracketing iterations and handing over to
   linesearch_zoom() once a bracket is found */
 /* func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   alpha1: initial value for step
   sigma,rho,t1,t2,t3: line search parameters (from Fletcher) 
   xo: observed data size n x 1
   m: size of parameter vectors xk,pk
   n: size of vector function
   step: step size for differencing 
   adata:  additional data passed to the function
   Returns the selected step length (1.0 if no condition is met within
   the iteration limit). Exits the program if memory cannot be allocated.
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static double 
 linesearch(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
 /* phi(alpha)=f(xk+alpha pk)
  for vector function func 
   f(xk) =||func(xk)-xo||^2 */
  double *x,*xp; /* work storage: x is n x 1, xp is m x 1 */
  double alphai,alphai1; /* current and previous trial step */
  double phi_0,phi_alphai,phi_alphai1;
  double p01,p02;
  double gphi_0,gphi_i;
  double alphak;
  double mu; /* upper limit used for the trial step */
  double tol; /* lower limit for minimization */
  int ci;
  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  if ((xp=(double*)calloc((size_t)m,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  /* value returned when the loop below ends without selecting a step */
  alphak=1.0;
  /* evaluate phi_0 and grad(phi_0) */
  func(xk,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  phi_0=my_dnrm2(n,x);
  phi_0*=phi_0;
  /* select tolerance: 1/100 of current function value, but at most 1e-6 */
  tol=MIN(0.01*phi_0,1e-6);
  /* grad(phi_0): evaluate at -step and +step */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */
  func(xp,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */
  func(xp,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  /* central difference of the squared norms */
  gphi_0=(p01*p01-p02*p02)/(2.0*step);
  /* estimate for mu */
  /* mu = (tol-phi_0)/(rho gphi_0) */
  /* NOTE(review): no guard against gphi_0==0 here */
  mu=(tol-phi_0)/(rho*gphi_0);
 #ifdef DEBUG
  printf("mu=%lf, alpha1=%lf\n",mu,alpha1);
 #endif
  ci=1;
  /* initial value for alpha(i); NOTE: 0<alphai<=mu is not checked here */
  alphai=alpha1;
  alphai1=0.0;
  phi_alphai1=phi_0;
  while(ci<10) {
   /* evaluate phi(alpha(i))=f(xk+alphai pk) */
   my_dcopy(m,xk,1,xp,1); /* xp<=xk */
   my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */
   func(xp,x,m,n,adata);
   /* calculate x<=x-xo */
   my_daxpy(n,xo,-1.0,x);
   phi_alphai=my_dnrm2(n,x);
   phi_alphai*=phi_alphai;
   /* cost already below the tolerance: accept this step */
   if (phi_alphai<tol) {
     alphak=alphai;
 #ifdef DEBUG
     printf("Linesearch : Condition 0 met\n");
 #endif
     break;
   }
   /* NOTE(review): the first test does not include rho, whereas
      linesearch_zoom() tests against phi_0+rho*alphaj*gphi_0 -
      confirm against Fletcher whether this is intended */
   if ((phi_alphai>phi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) {
      /* ai=alphai1, bi=alphai bracket */
      alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
 #ifdef DEBUG
      printf("Linesearch : Condition 1 met\n");
 #endif
      break;
   } 
   /* evaluate grad(phi(alpha(i))) */
   /* reset xp<=xk: required, because xp currently holds xk+alphai*pk
      and the next update adds the full (alphai+step)*pk */
   my_dcopy(m,xk,1,xp,1);
   my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */
   func(xp,x,m,n,adata);
   /* calculate x<=x-xo */
   my_daxpy(n,xo,-1.0,x);
   p01=my_dnrm2(n,x);
   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */
   func(xp,x,m,n,adata);
   /* calculate x<=x-xo */
   my_daxpy(n,xo,-1.0,x);
   p02=my_dnrm2(n,x);
   gphi_i=(p01*p01-p02*p02)/(2.0*step);
   /* gradient magnitude small enough compared to grad(phi(0)): accept alphai */
   if (fabs(gphi_i)<=-sigma*gphi_0) {
     alphak=alphai;
 #ifdef DEBUG
     printf("Linesearch : Condition 2 met\n");
 #endif
     break;
   }
   /* gradient is non-negative at alphai: bracket found */
   if (gphi_i>=0) {
     /* ai=alphai, bi=alphai1 bracket */
     alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
 #ifdef DEBUG
     printf("Linesearch : Condition 3 met\n");
 #endif
     break;
   }
   /* else preserve old values */
   if (mu<=(2.0*alphai-alphai1)) {
     /* next step */
     alphai1=alphai;
     alphai=mu;
   } else {
     /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */
     /* note: alphai1 is not updated in this branch */
     p01=2.0*alphai-alphai1;
     p02=MIN(mu,alphai+t1*(alphai-alphai1));
     alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
     //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai);
   }
   phi_alphai1=phi_alphai;
   ci++;
  }
  free(x);
  free(xp);
 #ifdef DEBUG
  printf("Step size=%g\n",alphak);
 #endif
  return alphak;
 }
 /*************** END Fletcher line search **********************************/
 /* limited-memory BFGS iteration
   func: vector function, called as func(p,hx,m,n,adata)
   p: parameters size m x 1; initial value on entry, overwritten by the
      solution on return
   x: data size n x 1; passed to func_grad() and, as the observed data,
      to linesearch(), where the cost is ||func(p)-x||^2
   m: no. of parameters
   n: size of vector function
   itmax: maximum no. of iterations
   M: no. of (s,y) pairs kept in storage
   gpu_threads: not used in this function
   adata: additional data passed to the function
   Always returns 0. Exits the program if memory cannot be allocated. */
 int
 lbfgs_fit(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {
  double *gk; /* gradients at both k+1 and k iter */
  double *xk1,*xk; /* parameters at k+1 and k iter */
  double *pk; /* step direction H_k * grad(f) */
  double step=1e-6; /* step size passed to func_grad() and linesearch() */
  double *y, *s; /* storage for delta(grad) and delta(p) */
  double *rho; /* storage for 1/yk^T*sk */
  /* ci: index into rho, ck: iteration count, cm: offset into s and y */
  int ci,ck,cm;
  double alphak=1.0;
  if ((gk=(double*)calloc((size_t)m,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  if ((xk1=(double*)calloc((size_t)m,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  if ((xk=(double*)calloc((size_t)m,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  if ((pk=(double*)calloc((size_t)m,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  /* storage size mM x 1*/
  if ((s=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  if ((y=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  if ((rho=(double*)calloc((size_t)M,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  /* initial value for params xk=p */
  my_dcopy(m,p,1,xk,1);
  /*  gradient gk=grad(f)_k */
  func_grad(func,xk,gk,x,m,n,step,adata);
  double gradnrm=my_dnrm2(m,gk);
  /* if gradient is too small, no need to solve, so stop */
  if (gradnrm<CLM_STOP_THRESH) {
   /* setting ck=itmax skips the iteration loop below */
   ck=itmax;
   step=0.0;
  } else {
   ck=0;
   /* step in [1e-9,1e-6] */
   step=MAX(1e-9,MIN(1e-3/gradnrm,1e-6));
  }
 #ifdef DEBUG
  printf("||grad||=%g step=%g\n",gradnrm,step);
 #endif
  cm=0;
  ci=0;
  while (ck<itmax) {
   /* mult with hessian  pk=-H_k*gk */
   /* before the storage is full only the ck pairs found so far are passed */
   if (ck<M) {
    mult_hessian(m,pk,gk,s,y,rho,ck,ci);
   } else {
    mult_hessian(m,pk,gk,s,y,rho,M,ci);
   }
   my_dscal(m,-1.0,pk);
   /* linesearch to find step length */
   /* parameters alpha1=10.0,sigma=0.1, rho=0.01, t1=9, t2=0.1, t3=0.5 */
   alphak=linesearch(func,xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata);
   /* parameters c1=1e-4 c2=0.9, alpha1=1.0, alphamax=10.0, step (for alpha)=1e-4*/
   //alphak=linesearch_nw(func,xk,pk,1.0,10.0,1e-4,0.9,x,m,n,1e-4,adata);
   //alphak=1.0;
   /* check if step size is too small, then stop */
   if (fabs(alphak)<CLM_EPSILON) {
    break;
   }
   /* update parameters xk1=xk+alpha_k *pk */
   my_dcopy(m,xk,1,xk1,1);
   my_daxpy(m,pk,alphak,xk1);
   /* calculate sk=xk1-xk and yk=gk1-gk */
   /* sk=xk1 */ 
   my_dcopy(m,xk1,1,&s[cm],1); 
   /* sk=sk-xk */
   my_daxpy(m,xk,-1.0,&s[cm]);
   /* yk=-gk */ 
   my_dcopy(m,gk,1,&y[cm],1); 
   my_dscal(m,-1.0,&y[cm]);
   /* update gradient */
   func_grad(func,xk1,gk,x,m,n,step,adata);
   /* yk=yk+gk1 */
   my_daxpy(m,gk,1.0,&y[cm]);
   /* calculate 1/yk^T*sk */
   /* NOTE(review): no guard against yk^T*sk being zero */
   rho[ci]=1.0/my_ddot(m,&y[cm],&s[cm]);
   /* update xk=xk1 */
   my_dcopy(m,xk1,1,xk,1); 
   //printf("iter %d store %d\n",ck,cm);
   ck++;
   /* increment storage appropriately */
   /* s,y are used as circular storage of M slots, each of size m */
   if (cm<(M-1)*m) {
    /* offset of m */
    cm=cm+m;
    ci++;
   } else {
    /* last slot was just written: wrap around to the first */
    cm=ci=0;
   }
  }
 /* copy back solution to p */
 my_dcopy(m,xk,1,p,1);
 /* for (ci=0; ci<m; ci++) {
   printf("grad %d=%lf\n",ci,gk[ci]);
  } */
  free(gk);
  free(xk1);
  free(xk);
  free(pk);
  free(s);
  free(y);
  free(rho);
  return 0;
 }
--- a/src/lib/Solvers/lbfgs_nocuda.o
+++ b/src/lib/Solvers/lbfgs_nocuda.o
--- a/src/lib/Solvers/libdirac-gpu.a
+++ b/src/lib/Solvers/libdirac-gpu.a
--- a/src/lib/Solvers/libdirac.a
+++ b/src/lib/Solvers/libdirac.a
--- a/src/lib/Solvers/libsolvers-gpu.a
+++ b/src/lib/Solvers/libsolvers-gpu.a
--- a/src/lib/Solvers/lmfit.c
+++ b/src/lib/Solvers/lmfit.c
--- a/src/lib/Solvers/lmfit.o
+++ b/src/lib/Solvers/lmfit.o
--- a/src/lib/Solvers/lmfit_nocuda.c
+++ b/src/lib/Solvers/lmfit_nocuda.c
--- a/src/lib/Solvers/lmfit_nocuda.o
+++ b/src/lib/Solvers/lmfit_nocuda.o
--- a/src/lib/Solvers/load_balance.c
+++ b/src/lib/Solvers/load_balance.c
@ -1,161 +0,0 @@
 /*
 *
 Copyright (C) 2006-2015 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <pthread.h>
 #include <math.h>
 #include "Solvers.h"
 #include <nvml.h>
 //#define MPI_BUILD
 #ifdef MPI_BUILD
 #include <mpi.h>
 #endif
 //#define DEBUG
 /* return random value in 0,1,..,maxval */
 #ifndef MPI_BUILD
 static int
 random_pick(int maxval, taskhist *th) {
  /* th is not used; it is kept so that callers need not change */
  (void)th;
  /* random() returns a value in [0,2^31-1]; dividing by 2^31 keeps the
     ratio strictly below 1, so the result can never be maxval+1 */
  double rat=(double)random()/2147483648.0;
  double y=rat*(double)(maxval+1);
  int x=(int)floor(y); 
  return x;
 }
 #endif
 void
 init_task_hist(taskhist *th) {
 /* create the lock, then reset the history:
   prev=-1 matches no GPU id, rseed starts at 0 */
 pthread_mutex_init(&th->prev_mutex,NULL);
 th->rseed=0;
 th->prev=-1;
 }
 void
 destroy_task_hist(taskhist *th) {
 /* release the lock, then reset the history to the same
   values that init_task_hist() assigns */
 pthread_mutex_destroy(&th->prev_mutex);
 th->rseed=0;
 th->prev=-1;
 }
 /* select a GPU from 0,1..,max_gpu
   in such a way to allow load balancing
   max_gpu: highest GPU id that may be returned
   th: task history; th->prev is updated (under th->prev_mutex) with the
       returned id in the non-MPI build
   MPI build: the GPU is chosen from the MPI rank.
   Otherwise: the GPU with the most free memory is chosen; if all have equal
   free memory the one with the lowest utilization; if there is nothing to
   tell them apart (or NVML cannot be used) a random one */
 #ifndef MPI_BUILD
 /* draw a random GPU id in 0..max_gpu, avoiding the previously returned id
   whenever another choice exists, and store the result in th->prev */
 static int
 select_random_gpu(int max_gpu, taskhist *th) {
  int retval=random_pick(max_gpu, th);
  pthread_mutex_lock(&th->prev_mutex);
  if (max_gpu>0) {
   /* at least two candidates exist, so drawing again will terminate;
      also reject any draw that is above max_gpu */
   while (retval==th->prev || retval>max_gpu) {
    retval=random_pick(max_gpu, th);
   }
  } else {
   /* only GPU 0 can be selected: drawing again until the value differs
      from th->prev would loop forever while holding the mutex */
   retval=0;
  }
  th->prev=retval;
  pthread_mutex_unlock(&th->prev_mutex);
  return retval;
 }
 #endif
 int
 select_work_gpu(int max_gpu, taskhist *th) {
 #ifdef MPI_BUILD
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  /* check if max_gpu > no. of actual devices;
     only trust the device count if the query succeeded and found a device,
     otherwise the modulo below would divide by zero or use garbage */
  int actual_devcount=0;
  if (cudaGetDeviceCount(&actual_devcount)==cudaSuccess && actual_devcount>0
      && max_gpu+1>actual_devcount) {
   return rank%(actual_devcount);
  }
  return rank%(max_gpu+1); /* modulo value */
 #endif
 #ifndef MPI_BUILD
  /* sequentially query the devices to find 
     one with the min load/memory usage */
  nvmlReturn_t result;
  int retval;
  int minid=-1; /* device with lowest utilization */
  int maxid=-1; /* device with most free memory */
  int nqueried=0; /* no. of devices queried successfully */
  result = nvmlInit();
  if (result!=NVML_SUCCESS) {
    fprintf(stderr,"%s: %d: cannot access NVML\n",__FILE__,__LINE__);
    /* return random pick (different from the previous one if possible) */
    return select_random_gpu(max_gpu, th);
  }
  /* iterate */
  nvmlDevice_t device;
  nvmlUtilization_t nvmlUtilization;
  nvmlMemory_t nvmlMemory; 
  unsigned int min_util=101; /* GPU utilization */
  unsigned int max_util=0; /* GPU utilization */
  unsigned long long int max_free=0; /* max free memory */
  unsigned long long int min_free=ULLONG_MAX; /* min free memory */
  int ci;
  for (ci=0; ci<=max_gpu; ci++) {
    /* skip a device that cannot be queried, instead of using
       stale or uninitialized values for it */
    if (nvmlDeviceGetHandleByIndex((unsigned int)ci, &device)!=NVML_SUCCESS
     || nvmlDeviceGetUtilizationRates(device, &nvmlUtilization)!=NVML_SUCCESS
     || nvmlDeviceGetMemoryInfo(device, &nvmlMemory)!=NVML_SUCCESS) {
      continue;
    }
    nqueried++;
    if (min_util>nvmlUtilization.gpu) {
        min_util=nvmlUtilization.gpu;
        minid=ci;
    }
    if (max_util<nvmlUtilization.gpu) {
        max_util=nvmlUtilization.gpu;
    }
    if (max_free<nvmlMemory.free) {
       max_free=nvmlMemory.free;
       maxid=ci;
    }
    if (min_free>nvmlMemory.free) {
       min_free=nvmlMemory.free;
    }
  }
  (void)nvmlShutdown();
  /* give priority for selection a GPU with max free memory,
     if there is a tie, use min utilization as second criterion */ 
  /* if nothing could be queried, or all devices look the same,
     again use random */
  if (nqueried==0 || (max_free==min_free && max_util==min_util)) {
   return select_random_gpu(max_gpu, th);
  }
  if (max_free==min_free) { /* all cards have equal free mem */
    retval=minid;
  } else {
    retval=maxid;
  }
  if (retval<0) {
   /* no usable id was recorded: fall back to a random pick */
   return select_random_gpu(max_gpu, th);
  }
  /* update last pick */
  pthread_mutex_lock(&th->prev_mutex);
  th->prev=retval;
  pthread_mutex_unlock(&th->prev_mutex);
  return retval;
 #endif
 }
--- a/src/lib/Solvers/load_balance.o
+++ b/src/lib/Solvers/load_balance.o
--- a/src/lib/Solvers/manifold_average.c
+++ b/src/lib/Solvers/manifold_average.c
@ -1,627 +0,0 @@
 /*
 *
 Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "Solvers.h"
 #include <math.h>
 //#define DEBUG
 /* per-thread work description for manifold_average_threadfn() */
 typedef struct thread_data_manavg_ {
 double *Y; /* solutions, shared by all threads; indexed as Y[cj*8*N*M+ci*8*N+...] for frequency cj and cluster ci */
 int startM; /* first cluster handled by this thread */
 int endM; /* last cluster handled by this thread (inclusive) */
 int Niter; /* no. of average+projection iterations */
 int N; /* each cluster of each frequency occupies 8*N doubles of Y */
 int M; /* total no. of clusters */
 int Nf; /* no. of frequencies */
 } thread_data_manavg_t;
 /* worker thread function for manifold average+projection
   data: pointer to a thread_data_manavg_t
   For each cluster in [startM,endM], the Nf blocks of that cluster are
   copied out of Y, averaged and projected, and the result is written
   back to the same locations in Y.
   Always returns NULL; exits the program on allocation or
   workspace-query failure. */
 static void*
 manifold_average_threadfn(void *data) {
 thread_data_manavg_t *t=(thread_data_manavg_t*)data;
 int ci,cj,iter;
 double *Yl;
 complex double *J3,*Jp;
 /* local storage 2Nx2 x Nf complex values */
 if ((Yl=(double*)malloc((size_t)t->N*8*t->Nf*sizeof(double)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 /* J3: average (2Nx2 complex) */
 if ((J3=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 /* Jp: storage for one projected block (2Nx2 complex) */
 if ((Jp=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 #ifdef DEBUG
 complex double *Jerr;
 if ((Jerr=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 #endif
 /* complex view of the local storage */
 complex double *Yc=(complex double*)Yl;
 /* scale factor 1/Nf used for averaging */
 complex double a=1.0/(double)t->Nf+0.0*_Complex_I;
 /* work for SVD */
 complex double *WORK=0;
 complex double w[1];
 double RWORK[32]; /* size > 5*max_matrix_dimension */
 complex double JTJ[4],U[4],VT[4];
 double S[2];
 /* call with lwork=-1: w[0] is then used as the work size
    (presumably the LAPACK workspace query convention) */
 int status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
 if (status!=0) {
   fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
   exit(1);
 } 
 int lwork=(int)w[0];
 if ((WORK=(complex double*)malloc((size_t)(int)lwork*sizeof(complex double)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 for (ci=t->startM; ci<=t->endM; ci++) {
   /* copy to local storage */
   /* the 8 doubles per entry of Y (stride 8) are regrouped into two
      halves of 4*N doubles each (stride 4): offsets 0,1,4,5 go to the
      first half and offsets 2,3,6,7 to the second half */
   for (cj=0; cj<t->Nf; cj++) {
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8, &Yl[cj*8*t->N], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8, &Yl[cj*8*t->N+1], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8, &Yl[cj*8*t->N+2], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8, &Yl[cj*8*t->N+3], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8, &Yl[cj*8*t->N+4*t->N], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8, &Yl[cj*8*t->N+4*t->N+1], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8, &Yl[cj*8*t->N+4*t->N+2], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8, &Yl[cj*8*t->N+4*t->N+3], 4);
   }
   /* first averaging, select random block in [0,Nf-1] to project to */
   /* NOTE(review): rand() is called from worker threads - confirm this
      is acceptable */
   int cr=rand()%(t->Nf); /* remainder always in [0,Nf-1] */
   /* J3 <= cr th  block */
   my_ccopy(t->N*4,&Yc[cr*t->N*4],1,J3,1);
   /* project the remainder */
   for (cj=0; cj<cr; cj++) {
      project_procrustes_block(t->N,J3,&Yc[cj*t->N*4]);
   }
   for (cj=cr+1; cj<t->Nf; cj++) {
      project_procrustes_block(t->N,J3,&Yc[cj*t->N*4]);
   }
   /* now each 2, 2N complex values is one J block */
   /* average values and project to common average */
   for (iter=0; iter<t->Niter; iter++) {
     /* J3 <= 1st block */
     my_ccopy(t->N*4,Yc,1,J3,1); 
     /* add the remainder */
     for (cj=1; cj<t->Nf; cj++) {
     my_caxpy(t->N*4,&Yc[cj*t->N*4],1.0+_Complex_I*0.0,J3);
     }
     /* J3 <= J3/Nf : average of all blocks */
     my_cscal(t->N*4,a,J3);
     /* now find unitary matrix using Procrustes problem */
     for (cj=0; cj<t->Nf; cj++) {
       /* find product JTJ = J^H J3 */
       my_zgemm('C','N',2,2,2*t->N,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,J3,2*t->N,0.0+_Complex_I*0.0,JTJ,2);
       /* NOTE: status of this SVD is not checked */
       status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
       //printf("%d %d %lf %lf\n",ci,cj,S[0],S[1]);
       /* find JTJ= U V^H */
       my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
       /* find J*(JTJ) : projected matrix */
       my_zgemm('N','N',2*t->N,2,2,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,JTJ,2,0.0+_Complex_I*0.0,Jp,2*t->N);
       /* copy back */
       my_ccopy(t->N*4,Jp,1,&Yc[cj*t->N*4],1); 
 #ifdef DEBUG
     /* calculate error between projected value and global mean */
     my_ccopy(t->N*4,J3,1,Jerr,1); 
     my_caxpy(t->N*4,&Yc[cj*t->N*4],-1.0+_Complex_I*0.0,Jerr);
     printf("Error freq=%d dir=%d iter=%d %lf\n",cj,ci,iter,my_cnrm2(t->N*4,Jerr));
 #endif
     }
   }
   /* now get a fresh copy, because we should modify Y only by 
      one unitary matrix  */
   /* J3 <= average of the iterated blocks, calculated before the local
      storage is overwritten by the fresh copy */
   my_ccopy(t->N*4,Yc,1,J3,1);
   /* add the remainder */
   for (cj=1; cj<t->Nf; cj++) {
      my_caxpy(t->N*4,&Yc[cj*t->N*4],1.0+_Complex_I*0.0,J3);
   }
   my_cscal(t->N*4,a,J3);
   /* reload the original values from Y (same regrouping as above) */
   for (cj=0; cj<t->Nf; cj++) {
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8, &Yl[cj*8*t->N], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8, &Yl[cj*8*t->N+1], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8, &Yl[cj*8*t->N+2], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8, &Yl[cj*8*t->N+3], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8, &Yl[cj*8*t->N+4*t->N], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8, &Yl[cj*8*t->N+4*t->N+1], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8, &Yl[cj*8*t->N+4*t->N+2], 4);
     my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8, &Yl[cj*8*t->N+4*t->N+3], 4);
   }
   /* project each original block once onto the average J3 */
   for (cj=0; cj<t->Nf; cj++) {
       /* find product JTJ = J^H J3 */
       my_zgemm('C','N',2,2,2*t->N,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,J3,2*t->N,0.0+_Complex_I*0.0,JTJ,2);
       /* NOTE: status of this SVD is not checked */
       status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
       /* find JTJ= U V^H */
       my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
       /* find J*(JTJ) : projected matrix */
       my_zgemm('N','N',2*t->N,2,2,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,JTJ,2,0.0+_Complex_I*0.0,Jp,2*t->N);
       /* copy back */
       my_ccopy(t->N*4,Jp,1,&Yc[cj*t->N*4],1);
   }
   /* copy back from local storage */
   /* inverse of the regrouping done when copying to local storage */
   for (cj=0; cj<t->Nf; cj++) {
     my_dcopy(t->N, &Yl[cj*8*t->N], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8);
     my_dcopy(t->N, &Yl[cj*8*t->N+1], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8);
     my_dcopy(t->N, &Yl[cj*8*t->N+2], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8);
     my_dcopy(t->N, &Yl[cj*8*t->N+3], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8);
     my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8);
     my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+1], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8);
     my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+2], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8);
     my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+3], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8);
   }
 }
 #ifdef DEBUG
 free(Jerr);
 #endif
 free(Yl);
 free(J3);
 free(Jp);
 free(WORK);
 return NULL;
 }
 /* average and project the solutions in Y, dividing the M clusters
   among worker threads
   N,M,Nf: dimensions of Y (see below)
   Y: solutions, modified in place by the worker threads
   Niter: no. of average+projection iterations done per cluster
   Nt: no. of threads to use; values below 1 are treated as 1
   Always returns 0. Exits the program if memory cannot be allocated or
   a thread cannot be created. */
 int
 calculate_manifold_average(int N,int M,int Nf,double *Y,int Niter,int Nt) {
 /* Y : each 2Nx2xM blocks belong to one freq,
   select one 2Nx2 from this, reorder to J format : Nf blocks
   and average */
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_manavg_t *threaddata;
  int ci,Nthb0,Nthb,nth,nth1;
  /* at least one thread is needed: Nt<=0 would divide by zero below */
  if (Nt<1) {
   Nt=1;
  }
  /* clusters per thread */
  Nthb0=(M+Nt-1)/Nt;
  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((threaddata=(thread_data_manavg_t*)malloc((size_t)Nt*sizeof(thread_data_manavg_t)))==0) {
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  ci=0;
  for (nth=0;  nth<Nt && ci<M; nth++) {
    /* the last thread gets whatever clusters remain */
    if (ci+Nthb0<M) {
     Nthb=Nthb0;
    } else {
     Nthb=M-ci;
    }
    threaddata[nth].Y=Y;
    threaddata[nth].N=N;
    threaddata[nth].M=M;
    threaddata[nth].Nf=Nf;
    threaddata[nth].Niter=Niter;
    threaddata[nth].startM=ci;
    threaddata[nth].endM=ci+Nthb-1;
    /* a failed creation leaves th_array[nth] unset, and joining it later
       would be undefined, so stop here like the allocation failures do */
    if (pthread_create(&th_array[nth],&attr,manifold_average_threadfn,(void*)(&threaddata[nth]))!=0) {
     fprintf(stderr,"%s: %d: cannot create thread\n",__FILE__,__LINE__);
     exit(1);
    }
    ci=ci+Nthb;
  }
  /* wait for the threads that were started (nth of them) */
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
  return 0;
 }
 int
 project_procrustes(int N,double *J,double *J1) {
 /* min ||J - J1 U || find U, then overwrite J1 with J1 U.
   J,J1: 8N x 1 doubles each. Always returns 0. */
 /* offset, within each group of 8 doubles of J/J1, of the value that goes
   to packed position k; packed positions 0..3 form the first half of the
   local storage and 4..7 the second half */
 static const int interleaved[8]={0,1,4,5,2,3,6,7};
 const complex double one=1.0+_Complex_I*0.0;
 const complex double zero=0.0+_Complex_I*0.0;
 complex double *X,*Y,*WORK=0;
 complex double w[1];
 complex double JTJ[4],U[4],VT[4];
 double RWORK[32]; /* size > 5*max_matrix_dimension */
 double S[2];
 double *Jx,*Jy;
 int k,status,lwork;
 /* local storage */
 X=(complex double*)malloc((size_t)N*4*sizeof(complex double));
 if (X==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 Y=(complex double*)malloc((size_t)N*4*sizeof(complex double));
 if (Y==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 Jx=(double*)X;
 Jy=(double*)Y;
 /* copy to get correct format: J -> X and J1 -> Y */
 for (k=0; k<8; k++) {
   const int packed=(k/4)*4*N+(k%4);
   my_dcopy(N, &J[interleaved[k]], 8, &Jx[packed], 4);
   my_dcopy(N, &J1[interleaved[k]], 8, &Jy[packed], 4);
 }
 /* min ||X - Y U|| find U */
 /* find the work size needed for the SVD */
 status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
 if (status!=0) {
   fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
   exit(1);
 } 
 lwork=(int)w[0];
 WORK=(complex double*)malloc((size_t)lwork*sizeof(complex double));
 if (WORK==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 /* JTJ = Y^H X */
 my_zgemm('C','N',2,2,2*N,one,Y,2*N,X,2*N,zero,JTJ,2);
 /* JTJ = U S V^H */
 status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
 /* JTJ <= U V^H */
 my_zgemm('N','N',2,2,2,one,U,2,VT,2,zero,JTJ,2);
 /* X <= Y*(JTJ) : projected matrix */
 my_zgemm('N','N',2*N,2,2,one,Y,2*N,JTJ,2,zero,X,2*N);
 /* copy the projected values back to J1, undoing the reordering */
 for (k=0; k<8; k++) {
   my_dcopy(N, &Jx[(k/4)*4*N+(k%4)], 4, &J1[interleaved[k]], 8);
 }
 free(WORK);
 free(X);
 free(Y);
 return 0;
 }
 int
 project_procrustes_block(int N,complex double *X,complex double *Y) {
 /* min ||X - Y U || find U, then overwrite Y with Y U.
   X,Y: 4N x 1 complex values each. Always returns 0. */
 const complex double one=1.0+_Complex_I*0.0;
 const complex double zero=0.0+_Complex_I*0.0;
 complex double *Jlocal,*WORK=0;
 complex double w[1];
 complex double JTJ[4],U[4],VT[4];
 double RWORK[32]; /* size > 5*max_matrix_dimension */
 double S[2];
 int status,lwork;
 /* local storage for the projected matrix */
 Jlocal=(complex double*)malloc((size_t)N*4*sizeof(complex double));
 if (Jlocal==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 /* find the work size needed for the SVD */
 status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
 if (status!=0) {
   fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
   exit(1);
 } 
 lwork=(int)w[0];
 WORK=(complex double*)malloc((size_t)lwork*sizeof(complex double));
 if (WORK==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
 }
 /* JTJ = Y^H X */
 my_zgemm('C','N',2,2,2*N,one,Y,2*N,X,2*N,zero,JTJ,2);
 /* JTJ = U S V^H */
 status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
 /* JTJ <= U V^H */
 my_zgemm('N','N',2,2,2,one,U,2,VT,2,zero,JTJ,2);
 /* Jlocal <= Y*(JTJ) : projected matrix */
 my_zgemm('N','N',2*N,2,2,one,Y,2*N,JTJ,2,zero,Jlocal,2*N);
 /* Y <= Jlocal, copied as 8N doubles */
 my_dcopy(8*N, (double*)Jlocal, 1, (double*)Y, 1);
 free(WORK);
 free(Jlocal);
 return 0;
 }
 //#define DEBUG
 /* Extract only the phase of diagonal entries from solutions 
   p: 8Nx1 solutions, orders as [(real,imag)vec(J1),(real,imag)vec(J2),...]
   pout: 8Nx1 phases (exp(j*phase)) of solutions, after joint diagonalization of p
   N: no. of 2x2 Jones matrices in p, having common unitary ambiguity
   niter: no of iterations for Jacobi rotation */
 int
 extract_phases(double *p, double *pout, int N, int niter) {
  /* local storage */
  complex double *J,*Jcopy;
  /* local storage, change ordering of solutions [J_1^T,J_2^T,...]^T  */
  if ((J=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((Jcopy=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  double *Jx=(double *)J;
  /* copy to get correct format */
  my_dcopy(N, &p[0], 8, &Jx[0], 4);
  my_dcopy(N, &p[0+1], 8, &Jx[1], 4);
  my_dcopy(N, &p[0+4], 8, &Jx[2], 4);
  my_dcopy(N, &p[0+5], 8, &Jx[3], 4);
  my_dcopy(N, &p[0+2], 8, &Jx[4*N], 4);
  my_dcopy(N, &p[0+3], 8, &Jx[4*N+1], 4);
  my_dcopy(N, &p[0+6], 8, &Jx[4*N+2], 4);
  my_dcopy(N, &p[0+7], 8, &Jx[4*N+3], 4);
  complex double h[3],Hc[9];
  double H[9]; 
  double W[3],Z[3];
  double w[1],*WORK;
  int IWORK[15],IFAIL[3],info;
  int ni,ci;
  complex double c,s,G[4];
 #ifdef DEBUG
  printf("J=[\n");
  for (ci=0; ci<N; ci++) {
   printf("%lf+j*(%lf), %lf+j*(%lf)\n",p[8*ci],p[8*ci+1],p[8*ci+2],p[8*ci+3]);
   printf("%lf+j*(%lf), %lf+j*(%lf)\n",p[8*ci+4],p[8*ci+5],p[8*ci+6],p[8*ci+7]);
  }
  printf("];\n");
 #endif
  /* setup workspace for eigenvalue decomposition */
  info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,w,-1,IWORK,IFAIL);
  if (info) {
   fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
   exit(1);
  }
  /* get work size */
  int lwork=(int)w[0];
  /* allocate memory */
  if ((WORK=(double*)malloc((size_t)lwork*sizeof(double)))==0) {
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
   exit(1);
  } 
  /* iteration loop */
  for (ni=0; ni<niter; ni++) {
    /************** for element (1,2) **********************/
    /* accumulate h*h^H product */
    memset(Hc,0,9*sizeof(complex double));
    for (ci=0; ci<N; ci++) {
       /* [a_ii-a_jj,a_ij+a_ji,I*(a_ji-a_ij)] */
       h[0]=conj(J[2*ci]-J[2*ci+2*N+1]);
       h[1]=conj(J[2*ci+2*N]+J[2*ci+1]);
       h[2]=conj(_Complex_I*(J[2*ci+1]-J[2*ci+2*N]));
       /* store results onto lower triangle */
       my_zher('L',3,1.0,h,1,Hc,3);
    }
    /* get real part, copy it to lower triangle */
    H[0]=creal(Hc[0]);
    H[1]=creal(Hc[1]);
    H[2]=creal(Hc[2]);
    H[4]=creal(Hc[4]);
    H[5]=creal(Hc[5]);
    H[8]=creal(Hc[8]);
 #ifdef DEBUG
    printf("H=[\n");
    printf("%e %e %e\n",H[0],H[1],H[2]);
    printf("%e %e %e\n",H[1],H[4],H[5]);
    printf("%e %e %e\n",H[2],H[5],H[8]);
    printf("];\n");
 #endif
    info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,WORK,lwork,IWORK,IFAIL);
    if (info<0) {
     fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
     exit(1);
    }
 #ifdef DEBUG
    printf("max eigenvalue=%e\n",W[0]);
    printf("ev=[\n");
    printf("%e\n",Z[0]);
    printf("%e\n",Z[1]);
    printf("%e\n",Z[2]);
    printf("];\n");
 #endif
   /* form sin,cos values */
   if (Z[0]>=0.0) {
    c=sqrt(0.5+Z[0]*0.5)+_Complex_I*0.0;
    s=0.5*(Z[1]-_Complex_I*Z[2])/c;
   } else {
    /* flip sign of eigenvector */
    c=sqrt(0.5-Z[0]*0.5)+_Complex_I*0.0;
    s=0.5*(-Z[1]+_Complex_I*Z[2])/c;
   }
   /* form Givens rotation matrix */
   G[0]=c;
   G[1]=-s;
   G[2]=conj(s);
   G[3]=conj(c);
 #ifdef DEBUG
   printf("G=[\n");
   printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[0]),cimag(G[0]),creal(G[2]),cimag(G[2]));
   printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[1]),cimag(G[1]),creal(G[3]),cimag(G[3]));
   printf("];\n");
 #endif
   /* rotate J <= J * G^H: Jcopy = 1 x J x G^H  + 0 x Jcopy */
   my_zgemm('N','C',2*N,2,2,1.0+_Complex_I*0.0,J,2*N,G,2,0.0+_Complex_I*0.0,Jcopy,2*N);
   memcpy(J,Jcopy,(size_t)4*N*sizeof(complex double));
 #ifdef DEBUG
   printf("JGH=[\n");
   for (ci=0; ci<N; ci++) {
    printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
    printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
   }
   printf("];\n");
 #endif
    /************** for element (2,1) **********************/
    /* accumulate h*h^H product */
    memset(Hc,0,9*sizeof(complex double));
    for (ci=0; ci<N; ci++) {
       /* [a_ii-a_jj,a_ij+a_ji,I*(a_ji-a_ij)] */
       h[0]=conj(J[2*ci+2*N+1]-J[2*ci]);
       h[1]=conj(J[2*ci+1]+J[2*ci+2*N]);
       h[2]=conj(_Complex_I*(J[2*ci+2*N]-J[2*ci+1]));
       /* store results onto lower triangle */
       my_zher('L',3,1.0,h,1,Hc,3);
    }
    /* get real part, copy it to lower triangle */
    H[0]=creal(Hc[0]);
    H[1]=creal(Hc[1]);
    H[2]=creal(Hc[2]);
    H[4]=creal(Hc[4]);
    H[5]=creal(Hc[5]);
    H[8]=creal(Hc[8]);
 #ifdef DEBUG
    printf("H=[\n");
    printf("%e %e %e\n",H[0],H[1],H[2]);
    printf("%e %e %e\n",H[1],H[4],H[5]);
    printf("%e %e %e\n",H[2],H[5],H[8]);
    printf("];\n");
 #endif
    info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,WORK,lwork,IWORK,IFAIL);
    if (info<0) {
     fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
     exit(1);
    }
 #ifdef DEBUG
    printf("max eigenvalue=%e\n",W[0]);
    printf("ev=[\n");
    printf("%e\n",Z[0]);
    printf("%e\n",Z[1]);
    printf("%e\n",Z[2]);
    printf("];\n");
 #endif
   /* form sin,cos values */
   if (Z[0]>=0.0) {
    c=sqrt(0.5+Z[0]*0.5)+_Complex_I*0.0;
    s=0.5*(Z[1]-_Complex_I*Z[2])/c;
   } else {
    /* flip sign of eigenvector */
    c=sqrt(0.5-Z[0]*0.5)+_Complex_I*0.0;
    s=0.5*(-Z[1]+_Complex_I*Z[2])/c;
   }
   /* form Givens rotation matrix */
   G[0]=c;
   G[1]=-s;
   G[2]=conj(s);
   G[3]=conj(c);
 #ifdef DEBUG
   printf("G=[\n");
   printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[0]),cimag(G[0]),creal(G[2]),cimag(G[2]));
   printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[1]),cimag(G[1]),creal(G[3]),cimag(G[3]));
   printf("];\n");
 #endif
   /* rotate J <= J * G^H: Jcopy = 1 x J x G^H  + 0 x Jcopy */
   my_zgemm('N','C',2*N,2,2,1.0+_Complex_I*0.0,J,2*N,G,2,0.0+_Complex_I*0.0,Jcopy,2*N);
   /* before copying updated result, find residual norm */
   /* J = -Jcopy + J */
   my_caxpy(4*N,Jcopy,-1.0+_Complex_I*0.0,J); 
 #ifdef DEBUG
   printf("Iter %d residual=%lf\n",ni,my_cnrm2(4*N,J));
 #endif
   memcpy(J,Jcopy,(size_t)4*N*sizeof(complex double));
 #ifdef DEBUG
   printf("JGH=[\n");
   for (ci=0; ci<N; ci++) {
    printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
    printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
   }
   printf("];\n");
 #endif
  }
  free(WORK);
 #ifdef DEBUG
  printf("Jfinal=[\n");
  for (ci=0; ci<N; ci++) {
    printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
    printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
  }
  printf("];\n");
 #endif
  /* extract phase only from diagonal elements */
  for (ci=0; ci<N; ci++) {
    J[2*ci]=J[2*ci]/cabs(J[2*ci]);
    J[2*ci+2*N+1]=J[2*ci+2*N+1]/cabs(J[2*ci+2*N+1]);
  }
  /* copy back to output (only the diagonal values) */
  memset(pout,0,sizeof(double)*8*N);
  my_dcopy(N, &Jx[0], 4, &pout[0], 8);
  my_dcopy(N, &Jx[1], 4, &pout[0+1], 8);
  my_dcopy(N, &Jx[4*N+2], 4, &pout[0+6], 8);
  my_dcopy(N, &Jx[4*N+3], 4, &pout[0+7], 8);
  free(J);
  free(Jcopy);
  return 0;
 }
--- a/src/lib/Solvers/manifold_average.o
+++ b/src/lib/Solvers/manifold_average.o
--- a/src/lib/Solvers/manifold_fl.cu
+++ b/src/lib/Solvers/manifold_fl.cu
--- a/src/lib/Solvers/manifold_fl.o
+++ b/src/lib/Solvers/manifold_fl.o
--- a/src/lib/Solvers/mderiv.cu
+++ b/src/lib/Solvers/mderiv.cu
--- a/src/lib/Solvers/mderiv.o
+++ b/src/lib/Solvers/mderiv.o
--- a/src/lib/Solvers/mderiv_fl.cu
+++ b/src/lib/Solvers/mderiv_fl.cu
@ -1,380 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "cuda.h"
 #include <cuComplex.h>
 #include <stdio.h>
 /* enable this for checking for kernel failure */
 //#define CUDA_DBG
 /* Element-wise guarded division: y[i] <- y[i]/x[i] where x[i] > eps,
    otherwise y[i] <- 0. Thread tid (flat 1D index) handles element tid;
    threads with tid >= M do nothing. */
 __global__ void kernel_diagdiv_fl(int M, float eps, float *y, float *x){
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  /* surplus threads beyond the M elements leave early */
  if (tid>=M) {
    return;
  }
  const float denom=x[tid];
  y[tid]=(denom>eps) ? y[tid]/denom : 0.0f;
 }
 /* Add mu to the diagonal of A (A <- A + mu*I).
    Thread tid (flat 1D index) updates the entry at flat offset tid*(M+1),
    i.e. the tid-th diagonal element when A is stored with leading dimension M.
    Threads with tid >= M do nothing. */
 __global__ void kernel_diagmu_fl(int M, float *A,float mu){
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  /* make sure to use only M threads */
  if (tid<M) {
    /* widen before multiplying: in 32-bit unsigned arithmetic tid*(M+1)
       wraps around once M exceeds 65535 */
    size_t diag=(size_t)tid*((size_t)M+1);
    A[diag]=A[diag]+mu;
  }
 }
 /* Evaluate the model for every baseline (single precision).
    One thread per baseline: thread n reads the station pair bb[2n], bb[2n+1],
    the 8 floats p[8*sta .. 8*sta+7] of each of the two stations and
    coh[8n .. 8n+7]. Each group of 8 floats is used as a 2x2 complex matrix
    (re,im pairs, stored row by row), and the product G1 * C * G2^H is written
    to x[8n .. 8n+7] in the same layout.
    Threads with n >= Nbase, and baselines where either station index is
    negative (flagged), write nothing.
    N is accepted but not used in the kernel body. */
 __global__ void kernel_func_fl(int Nbase, float *x, float *coh, float *p, short *bb, int N){
  /* flat thread index doubles as the baseline number */
  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
  if (n>=Nbase) {
    return;
  }
  const int sta1=(int)bb[2*n];
  const int sta2=(int)bb[2*n+1];
  /* a negative station index marks a flagged baseline: skip it */
  if (sta1<0 || sta2<0) {
    return;
  }
  const float *g1=&p[sta1*8];
  const float *g2=&p[sta2*8];
  const float *c=&coh[8*n];
  float *out=&x[8*n];

  cuFloatComplex G1[4],C[4],G2[4];
  for (int k=0; k<4; k++) {
    G1[k].x=g1[2*k];
    G1[k].y=g1[2*k+1];
    C[k].x=c[2*k];
    C[k].y=c[2*k+1];
  }
  /* G2 is the conjugate transpose of station 2's matrix:
     source element (r,q) lands at (q,r) with its imaginary part negated */
  for (int r=0; r<2; r++) {
    for (int q=0; q<2; q++) {
      G2[2*q+r].x=g2[2*(2*r+q)];
      G2[2*q+r].y=-g2[2*(2*r+q)+1];
    }
  }

  /* T1 = G1*C, then T2 = T1*G2 (2x2 complex products) */
  cuFloatComplex T1[4],T2[4];
  for (int r=0; r<2; r++) {
    for (int q=0; q<2; q++) {
      T1[2*r+q]=cuCaddf(cuCmulf(G1[2*r],C[q]),cuCmulf(G1[2*r+1],C[2+q]));
    }
  }
  for (int r=0; r<2; r++) {
    for (int q=0; q<2; q++) {
      T2[2*r+q]=cuCaddf(cuCmulf(T1[2*r],G2[q]),cuCmulf(T1[2*r+1],G2[2+q]));
    }
  }

  /* store the result as interleaved re,im values */
  for (int k=0; k<4; k++) {
    out[2*k]=T2[k].x;
    out[2*k+1]=T2[k].y;
  }
 }
 /* Fill the Jacobian of the per-baseline model G1 * C * G2^H (single precision).
    2D launch: the x index selects the baseline n, the y index selects the
    parameter m (0..M-1). Parameter m belongs to station m>>3 and is
    component m-8*(m>>3) of that station's 8 floats.
    A thread writes 8 values, jac[m+M*(8*n+k)] for k=0..7, only when the
    baseline is not flagged (both station indices >= 0) and parameter m belongs
    to one of its two stations; every other entry is left untouched by this
    kernel.
    N is accepted but not used in the kernel body. */
 __global__ void kernel_jacf_fl(int Nbase, int M, float *jac, float *coh, float *p, short *bb, int N){
  /* global thread index : equal to the baseline */
  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
  /* which parameter:0...M */
  unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
  /* this thread works on
    jac[m+M*8*n], ..., jac[m+M*(8*n+7)], coh[8*n:8*n+7]
    bb[2*n:2*n+1] (sta1,sta2)
    p[8*sta:8*sta+7] for the station that parameter m does NOT belong to
    (the other station's 8 values are replaced by a unit vector, see below)
  */
  if(n<Nbase && m<M) {
    int sta1=(int)bb[2*n];
    int sta2=(int)bb[2*n+1];
    /* condition for calculating this baseline sum is 
     If this baseline is flagged,
     or if this parameter does not belong to sta1 or sta2
     we do not compute
    */
    /* same as m/8, written as a shift */
    int stc=m>>3; /* 0...Ns-1 (because M=total par= 8 * Nstations */
    if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {   
     /* 2x2 complex coherency of this baseline, re,im interleaved */
     cuFloatComplex C[4];
     C[0].x=coh[8*n];
     C[0].y=coh[8*n+1];
     C[1].x=coh[8*n+2];
     C[1].y=coh[8*n+3];
     C[2].x=coh[8*n+4];
     C[2].y=coh[8*n+5];
     C[3].x=coh[8*n+6];
     C[3].y=coh[8*n+7]; 
     /* which parameter exactly 0..7 */
     /* same as m%8 */
     int stoff=m-stc*8;
     /* pp1/pp2: the 8 floats used for station 1 / station 2.
        The station owning parameter m gets a unit vector (1 at stoff, 0 elsewhere),
        the other station gets its current values from p.
        sta1 is tested first, so when sta1==sta2 only the first branch runs. */
     float pp1[8]; 
     float pp2[8]; 
     if (stc==sta1) {
      for (int cn=0; cn<8; cn++) {
       pp1[cn]=0.0f;
       pp2[cn]=p[sta2*8+cn];
      }
      pp1[stoff]=1.0f;
     } else if (stc==sta2) {
      for (int cn=0; cn<8; cn++) {
       pp2[cn]=0.0f;
       pp1[cn]=p[sta1*8+cn];
      }
      pp2[stoff]=1.0f;
     }
     cuFloatComplex G1[4];
     G1[0].x=pp1[0];
     G1[0].y=pp1[1];
     G1[1].x=pp1[2];
     G1[1].y=pp1[3];
     G1[2].x=pp1[4];
     G1[2].y=pp1[5];
     G1[3].x=pp1[6];
     G1[3].y=pp1[7];
     cuFloatComplex T1[4];
     /* T=G1*C */
     T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
     T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
     T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
     T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
     cuFloatComplex G2[4];
     /* conjugate transpose: off-diagonal elements are swapped (G2[1]<->G2[2])
        and every imaginary part is negated */
     G2[0].x=pp2[0];
     G2[0].y=-pp2[1];
     G2[2].x=pp2[2];
     G2[2].y=-pp2[3];
     G2[1].x=pp2[4];
     G2[1].y=-pp2[5];
     G2[3].x=pp2[6];
     G2[3].y=-pp2[7];
     /* T2 = T1*G2 */
     cuFloatComplex T2[4];
     T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
     T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
     T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
     T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
     /* update jacobian */
     /* NOTE: row major order */
     /* row index is 8*n+k (data point), column index is m (parameter), row length M */
     jac[m+M*8*n]=T2[0].x;
     jac[m+M*(8*n+1)]=T2[0].y;
     jac[m+M*(8*n+2)]=T2[1].x;
     jac[m+M*(8*n+3)]=T2[1].y;
     jac[m+M*(8*n+4)]=T2[2].x;
     jac[m+M*(8*n+5)]=T2[2].y;
     jac[m+M*(8*n+6)]=T2[3].x;
     jac[m+M*(8*n+7)]=T2[3].y;
    } 
   }
 }
 /* only use extern if calling code is C */
 extern "C"
 {
 /* Host launcher for kernel_diagdiv_fl: Dpd[i] <- Dpd[i]/Sd[i] where Sd[i] > eps,
    0 otherwise, for the first M entries.
    Launches BlocksPerGrid blocks of ThreadsPerBlock threads and waits for the
    device to finish. Errors are only examined when built with CUDA_DBG. */
 void 
 cudakernel_diagdiv_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float eps, float *Dpd, float *Sd) {
  kernel_diagdiv_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, eps, Dpd, Sd);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  /* debug builds: report the last CUDA error and terminate */
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
 #endif
 }
 /* Host launcher for kernel_diagmu_fl:
    A <= A + mu I, i.e. mu is added to the diagonal entries of A (size MxM).
    ThreadsPerBlock and BlocksPerGrid are expected to be chosen by the caller
    so that at least M threads are launched.
    Waits for the device to finish; errors are only examined when built
    with CUDA_DBG. */
 void
 cudakernel_diagmu_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float *A, float mu) {
  kernel_diagmu_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, A, mu);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  /* debug builds: report the last CUDA error and terminate */
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
 #endif
 }
 /* Host launcher for kernel_func_fl (model evaluation f()).
    p: parameters, x: output data vector of N floats, coh: coherencies,
    bbh: baseline -> station pair mapping, Nbase: number of baselines.
    x is zeroed first, then the kernel is launched with BlocksPerGrid blocks of
    ThreadsPerBlock threads and the call waits for the device to finish.
    M and Mclus are accepted but not used here; Nstations is forwarded to the
    kernel. Errors are only examined when built with CUDA_DBG. */
 void
 cudakernel_func_fl(int ThreadsPerBlock, int BlocksPerGrid, float *p, float *x, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
  /* clear the whole output so skipped (flagged) baselines read as zero */
  cudaMemset(x, 0, N*sizeof(float));
  kernel_func_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase,  x, coh, p, bbh, Nstations);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  /* debug builds: report the last CUDA error and terminate */
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
 #endif
 }
 /* Host launcher for kernel_jacf_fl (Jacobian jacf()).
    p: parameters (Mx1), jac: Jacobian (NxM floats, row major), coh: coherencies,
    bbh: baseline -> station pair mapping, Nbase: number of baselines.
    jac is zeroed first; the kernel then fills only the entries that belong to
    unflagged baselines. The call waits for the device to finish.
    NOTE: ThreadsPerBlock_row and ThreadsPerBlock_col are currently ignored;
    the block shape is fixed at 16x8. Mclus is not used here.
    Errors are only examined when built with CUDA_DBG. */
 void
 cudakernel_jacf_fl(int ThreadsPerBlock_row, int  ThreadsPerBlock_col, float *p, float *jac, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
 #ifdef CUDA_DBG
  cudaError_t error;
 #endif
  /* fixed, small block shape: x covers baselines, y covers parameters */
  dim3 threadsPerBlock(16, 8);
  /* enough blocks to cover Nbase baselines by M parameters (ceiling division) */
  dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x, 
               (M+threadsPerBlock.y-1)/threadsPerBlock.y);
  /* set memory of jac to zero; the byte count is formed in size_t because
     N*M evaluated as int overflows for large Jacobians */
  size_t jacBytes=(size_t)N*(size_t)M*sizeof(float);
  cudaMemset(jac, 0, jacBytes);
  kernel_jacf_fl<<< numBlocks, threadsPerBlock>>>(Nbase,  M, jac, coh, p, bbh, Nstations);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
 #endif
 }
 }
--- a/src/lib/Solvers/mderiv_fl.o
+++ b/src/lib/Solvers/mderiv_fl.o
--- a/src/lib/Solvers/myblas.c
+++ b/src/lib/Solvers/myblas.c
@ -1,462 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "Solvers.h"
 #include <string.h> /* for memcpy */
 /* Machine-precision query: forwards the selector character CMACH
    (by address) to LAPACK dlamch_ and returns its result. */
 double
 dlamch(char CMACH) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern double dlamch_(char *CMACH);
  double value=dlamch_(&CMACH);
  return value;
 }
 /* BLAS dcopy wrapper: y = x for N doubles.
    x is read with increment Nx (so x must hold more than N*Nx values),
    y is written with increment Ny (so y must hold more than N*Ny values).
    When both increments are 1 a plain memcpy is used instead of dcopy_.
    N <= 0 copies nothing. */
 void
 my_dcopy(int N, double *x, int Nx, double *y, int Ny) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void dcopy_(int *N, double *x, int *incx, double *y, int *incy);
  /* nothing to do; this also stops a negative N from being cast to a
     huge size_t on the memcpy path below */
  if (N<=0) {
   return;
  }
  /* use memcpy if Nx=Ny=1 */
  if (Nx==1&&Ny==1) {
   memcpy((void*)y,(void*)x,sizeof(double)*(size_t)N);
  } else {
   dcopy_(&N,x,&Nx,y,&Ny);
  }
 }
 /* In-place scaling x = a * x of N doubles, via BLAS dscal_ with increment 1. */
 void
 my_dscal(int N, double a, double *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void dscal_(int *N, double *alpha, double *x, int *incx);
  int unit=1; /* contiguous vector */
  dscal_(&N, &a, x, &unit);
 }
 /* Single-precision variant: forwards to BLAS sscal_ with scale factor a,
    N floats, increment 1. */
 void
 my_sscal(int N, float a, float *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void sscal_(int *N, float *alpha, float *x, int *incx);
  int unit=1; /* contiguous vector */
  sscal_(&N, &a, x, &unit);
 }
 /* Dot product x^T*y of two length-N double vectors via BLAS ddot_,
    both read with increment 1. */
 double
 my_ddot(int N, double *x, double *y) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern double  ddot_(int *N, double *x, int *incx, double *y, int *incy);
  int unit=1;
  double product=ddot_(&N, x, &unit, y, &unit);
  return product;
 }
 /* 2-norm ||x||_2 of N doubles via BLAS dnrm2_, increment 1. */
 double
 my_dnrm2(int N, double *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern double  dnrm2_(int *N, double *x, int *incx);
  int unit=1;
  double norm=dnrm2_(&N, x, &unit);
  return norm;
 }
 /* Single-precision variant: forwards N floats to BLAS snrm2_ with
    increment 1 and returns its result. */
 float
 my_fnrm2(int N, float *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern float snrm2_(int *N, float *x, int *incx);
  int unit=1;
  float norm=snrm2_(&N, x, &unit);
  return norm;
 }
 /* 1-norm style sum over N doubles via BLAS dasum_, increment 1. */
 double
 my_dasum(int N, double *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern double  dasum_(int *N, double *x, int *incx);
  int unit=1;
  double total=dasum_(&N, x, &unit);
  return total;
 }
 /* Single-precision variant: forwards N floats to BLAS sasum_ with
    increment 1 and returns its result. */
 float
 my_fasum(int N, float *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern float sasum_(int *N, float *x, int *incx);
  int unit=1;
  float total=sasum_(&N, x, &unit);
  return total;
 }
 /* y = a.x + y for N doubles via BLAS daxpy_; both vectors use increment 1. */
 void
 my_daxpy(int N, double *x, double a, double *y) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
    extern void daxpy_(int *N, double *alpha, double *x, int *incx, double *y, int *incy);
    int unit=1; /* shared stride for x and y */
    daxpy_(&N, &a, x, &unit, y, &unit);
 }
 /* y = a.x + y for N doubles via BLAS daxpy_, with caller-supplied
    increments incx (for x) and incy (for y). */
 void
 my_daxpys(int N, double *x, int incx, double a, double *y, int incy) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
    extern void daxpy_(int *N, double *alpha, double *x, int *incx, double *y, int *incy);
    daxpy_(&N, &a,
           x, &incx,
           y, &incy);
 }
 /* Single-precision variant: forwards to BLAS saxpy_ with scalar a,
    N floats, increment 1 for both x and y. */
 void
 my_saxpy(int N, float *x, float a, float *y) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
    extern void saxpy_(int *N, float *alpha, float *x, int *incx, float *y, int *incy);
    int unit=1; /* shared stride for x and y */
    saxpy_(&N, &a, x, &unit, y, &unit);
 }
 /* Index of the entry with max |x| (indices start from 1), via BLAS idamax_
    over N doubles read with increment incx. */
 int
 my_idamax(int N, double *x, int incx) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
    extern int idamax_(int *N, double *x, int *incx);
    int where=idamax_(&N, x, &incx);
    return where;
 }
 /* Single-precision variant: forwards N floats with increment incx to
    BLAS isamax_ and returns the index it reports. */
 int
 my_isamax(int N, float *x, int incx) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
    extern int isamax_(int *N, float *x, int *incx);
    int where=isamax_(&N, x, &incx);
    return where;
 }
 /* Index of the entry with min |x| (indices start from 1), via idamin_
    over N doubles read with increment incx.
    NOTE(review): idamin_ is not in the reference BLAS; confirm the linked
    library provides it. */
 int
 my_idamin(int N, double *x, int incx) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
    extern int idamin_(int *N, double *x, int *incx);
    int where=idamin_(&N, x, &incx);
    return where;
 }
 /* Matrix-matrix product C = alpha*op(A)*op(B)+ beta*C via BLAS dgemm_.
    All scalar arguments are forwarded by address, the matrices unchanged. */
 void
 my_dgemm(char transa, char transb, int M, int N, int K, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void dgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double *ALPHA, double *A, int *LDA, double *B, int * LDB, double *BETA, double *C, int *LDC);
  dgemm_(&transa, &transb,
         &M, &N, &K,
         &alpha, A, &lda,
         B, &ldb,
         &beta, C, &ldc);
 }
 /* Matrix-vector product y = alpha*op(A)*x+ beta*y via BLAS dgemv_;
    trans selects op: 'T' or 'N'. */
 void
 my_dgemv(char trans, int M, int N, double alpha, double *A, int lda, double *x, int incx,  double beta, double *y, int incy) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void dgemv_(char *TRANS, int *M, int *N, double *ALPHA, double *A, int *LDA, double *X, int *INCX, double *BETA, double *Y, int *INCY);
  dgemv_(&trans,
         &M, &N,
         &alpha, A, &lda,
         x, &incx,
         &beta, y, &incy);
 }
 /* following routines used in LAPACK solvers */
 /* Cholesky factorization of a real symmetric matrix via LAPACK dpotrf_.
    Returns the info value reported by the routine. */
 int
 my_dpotrf(char uplo, int N, double *A, int lda) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void dpotrf_(char *uplo, int *N, double *A, int *lda, int *info);
  int status;
  dpotrf_(&uplo, &N, A, &lda, &status);
  return status;
 }
 /* Solve Ax=b from a Cholesky factorization via LAPACK dpotrs_.
    Returns the info value reported by the routine. */
 int 
 my_dpotrs(char uplo, int N, int nrhs, double *A, int lda, double *b, int ldb){
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
   extern void dpotrs_(char  *uplo, int *N, int *nrhs, double *A, int *lda, double *b, int *ldb, int *info);
   int status;
   dpotrs_(&uplo, &N, &nrhs, A, &lda, b, &ldb, &status);
   return status;
 }
 /* Solve Ax=b using QR factorization via LAPACK dgels_, with caller-provided
    workspace WORK of length LWORK. Returns the info value reported by the routine. */
 int
 my_dgels(char TRANS, int M, int N, int NRHS, double *A, int LDA, double *B, int LDB, double *WORK, int LWORK) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void dgels_(char *TRANS, int *M, int *N, int *NRHS, double *A, int *LDA, double *B, int *LDB, double *WORK, int *LWORK, int *INFO);
  int status;
  dgels_(&TRANS, &M, &N, &NRHS,
         A, &LDA, B, &LDB,
         WORK, &LWORK, &status);
  return status;
 }
 /* Singular value decomposition A=U S VT via LAPACK dgesvd_
    (VT is returned as is, so V needs NOT to be transposed).
    WORK/LWORK is the caller-provided workspace.
    Returns the info value reported by the routine. */
 int
 my_dgesvd(char JOBU, char JOBVT, int M, int N, double *A, int LDA, double *S,
   double *U, int LDU, double *VT, int LDVT, double *WORK, int LWORK) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
   extern void dgesvd_(char *JOBU, char *JOBVT, int *M, int *N, double *A, 
    int *LDA, double *S, double *U, int *LDU, double *VT, int *LDVT,
    double *WORK, int *LWORK, int *info);
   int status;
   dgesvd_(&JOBU, &JOBVT, &M, &N,
           A, &LDA, S,
           U, &LDU, VT, &LDVT,
           WORK, &LWORK, &status);
   return status;
 }
 /* QR factorization QR=A via LAPACK dgeqrf_: R is stored in A and only TAU
    is used to represent Q. Returns the info value reported by the routine. */
 int
 my_dgeqrf(int M, int N, double *A, int LDA, double *TAU, double *WORK, int LWORK) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 extern void dgeqrf_(int *M, int *N, double *A,  int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
  int status;
  dgeqrf_(&M, &N, A, &LDA, TAU, WORK, &LWORK, &status);
  return status;
 }
 /* Form Q from elementary reflections via LAPACK dorgqr_.
    Returns the info value reported by the routine. */
 int
 my_dorgqr(int M,int  N,int  K,double *A,int  LDA,double *TAU,double *WORK,int  LWORK) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void dorgqr_(int *M, int *N, int *K, double *A, int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
  int status;
  dorgqr_(&M, &N, &K,
          A, &LDA, TAU,
          WORK, &LWORK, &status);
  return status;
 }
 /* Solve the triangular system Ax=b (A triangular) via LAPACK dtrtrs_.
    Returns the info value reported by the routine. */
 int
 my_dtrtrs(char UPLO, char TRANS, char DIAG,int N,int  NRHS,double *A,int  LDA,double *B,int  LDB) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void dtrtrs_(char *UPLO,char *TRANS,char  *DIAG,int *N,int *NRHS,double *A,int *LDA,double *B,int *LDB,int *INFO);
  int status;
  dtrtrs_(&UPLO, &TRANS, &DIAG,
          &N, &NRHS,
          A, &LDA, B, &LDB, &status);
  return status;
 }
 /* Complex copy wrapper (BLAS zcopy_): y = x for N complex doubles.
    x is read with increment Nx (so x must hold more than N*Nx values),
    y is written with increment Ny (so y must hold more than N*Ny values).
    When both increments are 1 a plain memcpy is used instead of zcopy_.
    N <= 0 copies nothing. */
 void
 my_ccopy(int N, complex double *x, int Nx, complex double *y, int Ny) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void zcopy_(int *N, complex double *x, int *incx, complex double *y, int *incy);
  /* nothing to do; this also stops a negative N from being cast to a
     huge size_t on the memcpy path below */
  if (N<=0) {
   return;
  }
  /* use memcpy if Nx=Ny=1 */
  if (Nx==1&&Ny==1) {
   memcpy((void*)y,(void*)x,sizeof(complex double)*(size_t)N);
  } else {
   zcopy_(&N,x,&Nx,y,&Ny);
  }
 }
 /* In-place scaling x = a * x of N complex doubles, via BLAS zscal_
    with increment 1. */
 void
 my_cscal(int N, complex double a, complex double *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void zscal_(int *N, complex double *alpha, complex double *x, int *incx);
  int unit=1; /* contiguous vector */
  zscal_(&N, &a, x, &unit);
 }
 /* y = a.x + y for N complex doubles via BLAS zaxpy_;
    both vectors use increment 1. */
 void
 my_caxpy(int N, complex double *x, complex double a, complex double *y) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
    extern void zaxpy_(int *N, complex double *alpha, complex double *x, int *incx, complex double *y, int *incy);
    int unit=1; /* shared stride for x and y */
    zaxpy_(&N, &a, x, &unit, y, &unit);
 }
 /* Conjugated dot product x^H*y of two length-N complex double vectors via
    BLAS zdotc_, both read with increment 1.
    NOTE(review): this relies on zdotc_ returning the complex result by value;
    confirm the linked BLAS uses that calling convention. */
 complex double
 my_cdot(int N, complex double *x, complex double *y) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern complex double  zdotc_(int *N, complex double *x, int *incx, complex double *y, int *incy);
  int unit=1;
  complex double product=zdotc_(&N, x, &unit, y, &unit);
  return product;
 }
 /* Complex singular value decomposition A=U S VT via LAPACK zgesvd_
    (VT is returned as is, so V needs NOT to be transposed).
    WORK/LWORK and RWORK are caller-provided workspaces.
    Returns the info value reported by the routine. */
 int
 my_zgesvd(char JOBU, char JOBVT, int M, int N, complex double *A, int LDA, double *S,
   complex double *U, int LDU, complex double *VT, int LDVT, complex double *WORK, int LWORK, double *RWORK) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
   extern void zgesvd_(char *JOBU, char *JOBVT, int *M, int *N, complex double *A, 
    int *LDA, double *S, complex double *U, int *LDU, complex double *VT, int *LDVT,
    complex double *WORK, int *LWORK, double *RWORK, int *info);
   int status;
   zgesvd_(&JOBU, &JOBVT, &M, &N,
           A, &LDA, S,
           U, &LDU, VT, &LDVT,
           WORK, &LWORK, RWORK, &status);
   return status;
 }
 /* Solve Ax=b using QR factorization via LAPACK zgels_ (complex double),
    with caller-provided workspace WORK of length LWORK.
    Returns the info value reported by the routine. */
 int
 my_zgels(char TRANS, int M, int N, int NRHS, complex double *A, int LDA, complex double *B, int LDB, complex double *WORK, int LWORK) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void zgels_(char *TRANS, int *M, int *N, int *NRHS, complex double *A, int *LDA, complex double *B, int *LDB, complex double *WORK, int *LWORK, int *INFO);
  int status;
  zgels_(&TRANS, &M, &N, &NRHS,
         A, &LDA, B, &LDB,
         WORK, &LWORK, &status);
  return status;
 }
 /* Solve Ax=b using QR factorization via LAPACK cgels_ (complex float),
    with caller-provided workspace WORK of length LWORK.
    Returns the info value reported by the routine. */
 int
 my_cgels(char TRANS, int M, int N, int NRHS, complex float *A, int LDA, complex float *B, int LDB, complex float *WORK, int LWORK) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void cgels_(char *TRANS, int *M, int *N, int *NRHS, complex float *A, int *LDA, complex float *B, int *LDB, complex float *WORK, int *LWORK, int *INFO);
  int status;
  cgels_(&TRANS, &M, &N, &NRHS,
         A, &LDA, B, &LDB,
         WORK, &LWORK, &status);
  return status;
 }
 /* Complex matrix-matrix product C = alpha*op(A)*op(B)+ beta*C via BLAS zgemm_.
    All scalar arguments are forwarded by address, the matrices unchanged. */
 void
 my_zgemm(char transa, char transb, int M, int N, int K, complex double alpha, complex double *A, int lda, complex double *B, int ldb, complex double beta, complex double *C, int ldc) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void zgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, complex double *ALPHA, complex double *A, int *LDA, complex double *B, int * LDB, complex double *BETA, complex double *C, int *LDC);
  zgemm_(&transa, &transb,
         &M, &N, &K,
         &alpha, A, &lda,
         B, &ldb,
         &beta, C, &ldc);
 }
 /* 2-norm ||x||_2 of N complex doubles via BLAS dznrm2_, increment 1. */
 double
 my_cnrm2(int N, complex double *x) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern double  dznrm2_(int *N, complex double *x, int *incx);
  int unit=1;
  double norm=dznrm2_(&N, x, &unit);
  return norm;
 }
 /* Single-precision copy wrapper (BLAS scopy_): y = x for N floats.
    x is read with increment Nx (so x must hold more than N*Nx values),
    y is written with increment Ny (so y must hold more than N*Ny values).
    When both increments are 1 a plain memcpy is used instead of scopy_.
    N <= 0 copies nothing. */
 void
 my_fcopy(int N, float *x, int Nx, float *y, int Ny) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
  extern void scopy_(int *N, float *x, int *incx, float *y, int *incy);
  /* nothing to do; this also stops a negative N from being cast to a
     huge size_t on the memcpy path below */
  if (N<=0) {
   return;
  }
  /* use memcpy if Nx=Ny=1 */
  if (Nx==1&&Ny==1) {
   memcpy((void*)y,(void*)x,sizeof(float)*(size_t)N);
  } else {
   scopy_(&N,x,&Nx,y,&Ny);
  }
 }
 /* Eigenvalue "expert" routine for a real symmetric matrix: forwards every
    argument to LAPACK dsyevx_ and returns the info value it reports.
    M is forwarded by address but is a by-value parameter here, so anything
    the routine stores into it does not reach the caller. */
 int 
 my_dsyevx(char jobz, char range, char uplo, int N, double *A, int lda,
  double vl, double vu, int il, int iu, double abstol, int M, double  *W,
  double *Z, int ldz, double *WORK, int lwork, int *iwork, int *ifail) {
  extern void dsyevx_(char *JOBZ, char *RANGE, char *UPLO, int *N, double *A, int *LDA,
   double  *VL, double *VU, int *IL, int *IU, double *ABSTOL, int *M, double *W, double *Z, 
   int *LDZ, double *WORK, int *LWORK, int *IWORK, int *IFAIL, int *INFO);
  int status;
  dsyevx_(&jobz, &range, &uplo,
          &N, A, &lda,
          &vl, &vu, &il, &iu, &abstol,
          &M, W, Z, &ldz,
          WORK, &lwork, iwork, ifail, &status);
  return status;
 } 
 /* Hermitian rank-one update via BLAS zher_:
    A = alpha x x^H + A
    uplo, N, alpha, incx and lda are forwarded by address.
 */
 void
 my_zher(char uplo, int N, double alpha, complex double *x, int incx, complex double *A, int lda) {
  extern void zher_(char *UPLO, int *N, double *ALPHA, complex double *X, int *INCX, complex double *A, int *LDA);
  zher_(&uplo, &N, &alpha,
        x, &incx,
        A, &lda);
 }
--- a/src/lib/Solvers/myblas.o
+++ b/src/lib/Solvers/myblas.o
--- a/src/lib/Solvers/oslmfit.c
+++ b/src/lib/Solvers/oslmfit.c
@ -1,705 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include <float.h>
 #include "Solvers.h"
 #include <cuda_runtime.h>
 //#define DEBUG
 /* helper functions for diagnostics */
 /* Report a failed CUDA runtime call and terminate the process.
    Only active when CUDA_DEBUG is defined; otherwise this is a no-op.
    err  : status returned by the CUDA call
    file : source file of the call site (normally __FILE__)
    line : source line of the call site (normally __LINE__) */
 static void
 checkCudaError(cudaError_t err, char *file, int line)
 {
 #ifdef CUDA_DEBUG
    if (err) {
        fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
        exit(EXIT_FAILURE);
    }
 #endif
 }
 /* Report a failed cuBLAS call and terminate the process.
    Only active when CUDA_DEBUG is defined; otherwise this is a no-op.
    cbstatus : status returned by the cuBLAS call
    file,line: call site (normally __FILE__, __LINE__) */
 static void
 checkCublasError(cublasStatus_t cbstatus, char *file, int line)
 {
 #ifdef CUDA_DEBUG
   if (cbstatus==CUBLAS_STATUS_SUCCESS) {
    return;
   }
   fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
   exit(EXIT_FAILURE);
 #endif
 }
 /* OS-LM, but f() and jac() calculations are done 
  entirely in the GPU.
  Outline of what this routine does:
   - device buffers are either allocated here (gWORK==NULL) or carved out of
     the caller supplied gWORK block, in the fixed order used below
   - the residual e=x-f(p) is always evaluated over the full data (cudakernel_func)
   - in each iteration the Jacobian is evaluated only for one subset of the
     data (cudakernel_jacf), chosen round-robin or from a random permutation
   - the damped normal equations are solved with cusolver (Cholesky, QR or SVD)
  The func and jacf arguments are not called by this routine.
  The return value is always 0; the outcome is reported through info[]. */
 int
 oslevmar_der_single_cuda(
  void (*func)(double *p, double *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in  R^n */
  void (*jacf)(double *p, double *j, int m, int n, void *adata),  /* function to evaluate the Jacobian \part x / \part p */
  double *p,         /* I/O: initial parameter estimates. On output has the estimated solution */
  double *x,         /* I: measurement vector. NULL implies a zero vector */
  int M,              /* I: parameter vector dimension (i.e. #unknowns) */
  int N,              /* I: measurement vector dimension */
  int itmax,          /* I: maximum number of iterations */
  double opts[4],   /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
                       * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used
                       */
  double info[10], 
                      /* O: information regarding the minimization. Set to NULL if don't care
                      * info[0]= ||e||_2 at initial p.
                      * info[1-4]=[ ||e||_2, ||J^T e||_inf,  ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
                      * info[5]= # iterations,
                      * info[6]=reason for terminating: 1 - stopped by small gradient J^T e
                      *                                 2 - stopped by small Dp
                      *                                 3 - stopped by itmax
                      *                                 4 - singular matrix. Restart from current p with increased mu 
                      *                                 5 - no further error reduction is possible. Restart with increased mu
                      *                                 6 - stopped by small ||e||_2
                      *                                 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
                      * info[7]= # function evaluations
                      * info[8]= # Jacobian evaluations
                      * info[9]= # linear systems solved, i.e. # attempts for reducing error
                      * (info[7..9] are always written as 0 by this routine)
                      */
  cublasHandle_t cbhandle, /* device handle */
  cusolverDnHandle_t solver_handle, /* solver handle */
  double *gWORK, /* GPU allocated memory */
  int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
  int tileoff, /* tile offset when solving for many chunks */
  int ntiles, /* total tile (data) size being solved for */
  int randomize, /* if >0 randomize */
  void *adata)       /* pointer to possibly additional data, passed uninterpreted to func & jacf.
                      * Set to NULL if not needed
                      */
 {
  /* general note: all device variables end with a 'd' */
  int stop=0;
  cudaError_t err;
  cublasStatus_t cbstatus;
  int nu=2,nu2;
  double p_L2, Dp_L2=DBL_MAX, dF, dL, p_eL2, jacTe_inf=0.0, pDp_eL2, init_p_eL2;
  double tmp,mu=0.0;
  double tau, eps1, eps2, eps2_sq, eps3;
  int k,ci,issolved;
  double *hxd;
  double *ed;
  double *xd;
  double *jacd;
  double *jacTjacd,*jacTjacd0;
  double *Dpd,*bd;
  double *pd,*pnewd;
  double *jacTed;
  /* used in QR solver */
  double *taud;
  /* used in SVD solver */
  double *Ud;
  double *VTd;
  double *Sd;
  /* ME data */
  me_data_t *dp=(me_data_t*)adata;
  int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
  /* sizes widened once, so that products such as M*N or M*M are formed in
     size_t instead of overflowing in int before being scaled by sizeof() */
  const size_t Msz=(size_t)M;
  const size_t Nsz=(size_t)N;
  const size_t Nbsz=(size_t)Nbase;
  /* coherency on device */
  double *cohd;
  /* baseline-station map on device/host */
  short *bbd;
  /* every selector other than 0 (Cholesky) or 1 (QR) ends up in the SVD branch
     of the solver below, so map it to 2 here: this way the SVD buffers are
     also allocated and released for such values */
  int solve_axb=(linsolv==0 || linsolv==1)? linsolv : 2;
  /* setup default settings */
  if(opts){
    tau=opts[0];
    eps1=opts[1];
    eps2=opts[2];
    eps2_sq=opts[2]*opts[2];
    eps3=opts[3];
  } else {
    tau=CLM_INIT_MU;
    eps1=CLM_STOP_THRESH;
    eps2=CLM_STOP_THRESH;
    eps2_sq=CLM_STOP_THRESH*CLM_STOP_THRESH;
    eps3=CLM_STOP_THRESH;
  }
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  unsigned long int moff;
  if (!gWORK) {
  err=cudaMalloc((void**)&xd, Nsz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&jacd, Msz*Nsz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&jacTjacd, Msz*Msz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&jacTed, Msz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&jacTjacd0, Msz*Msz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&Dpd, Msz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&bd, Msz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&pd, Msz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&pnewd, Msz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  /* needed for calculating f()  and jac() */
  err=cudaMalloc((void**) &bbd, Nbsz*2*sizeof(short));
  checkCudaError(err,__FILE__,__LINE__);
  /* we need coherencies for only this cluster */
  err=cudaMalloc((void**) &cohd, Nbsz*8*sizeof(double)); 
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&hxd, Nsz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**)&ed, Nsz*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  /* memory allocation: different solvers */
  if (solve_axb==1) {
    err=cudaMalloc((void**)&taud, Msz*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
  } else if (solve_axb==2) {
    err=cudaMalloc((void**)&Ud, Msz*Msz*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
    err=cudaMalloc((void**)&VTd, Msz*Msz*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
    err=cudaMalloc((void**)&Sd, Msz*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
  }
  } else {
    /* carve the buffers out of gWORK; offsets are counted in doubles and the
       order below is the layout the caller has to provide space for */
    moff=0;
    xd=&gWORK[moff];
    moff+=Nsz;
    jacd=&gWORK[moff];
    moff+=Msz*Nsz;
    jacTjacd=&gWORK[moff];
    moff+=Msz*Msz;
    jacTed=&gWORK[moff];
    moff+=Msz;
    jacTjacd0=&gWORK[moff];
    moff+=Msz*Msz;
    Dpd=&gWORK[moff];
    moff+=Msz;
    bd=&gWORK[moff];
    moff+=Msz;
    pd=&gWORK[moff];
    moff+=Msz;
    pnewd=&gWORK[moff];
    moff+=Msz;
    cohd=&gWORK[moff];
    moff+=Nbsz*8;
    hxd=&gWORK[moff];
    moff+=Nsz;
    ed=&gWORK[moff];
    moff+=Nsz;
    if (solve_axb==1) {
     taud=&gWORK[moff];
     moff+=Msz;
    } else if (solve_axb==2) {
     Ud=&gWORK[moff];
     moff+=Msz*Msz;
     VTd=&gWORK[moff];
     moff+=Msz*Msz;
     Sd=&gWORK[moff];
     moff+=Msz;
    }
    /* the short-valued baseline map sits last, after all double buffers */
    bbd=(short*)&gWORK[moff];
    moff+=(Nbsz*2*sizeof(short))/sizeof(double);
  }
  /* extra storage for cusolver */
  int work_size=0;
  int *devInfo;
  int devInfo_h=0;
  err=cudaMalloc((void**)&devInfo, sizeof(int));
  checkCudaError(err,__FILE__,__LINE__);
  double *work;
  double *rwork;
  if (solve_axb==0) {
    cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_UPPER, M, jacTjacd, M, &work_size);
    err=cudaMalloc((void**)&work, work_size*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
  } else if (solve_axb==1) {
    cusolverDnDgeqrf_bufferSize(solver_handle, M, M, jacTjacd, M, &work_size);
    err=cudaMalloc((void**)&work, work_size*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
  } else {
    cusolverDnDgesvd_bufferSize(solver_handle, M, M, &work_size);
    err=cudaMalloc((void**)&work, work_size*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
    err=cudaMalloc((void**)&rwork, 5*Msz*sizeof(double));
    checkCudaError(err,__FILE__,__LINE__);
  }
  err=cudaMemcpyAsync(pd, p, Msz*sizeof(double), cudaMemcpyHostToDevice,0);
  checkCudaError(err,__FILE__,__LINE__);
  /* need to give right offset for coherencies */
  /* offset: cluster offset+time offset */
  err=cudaMemcpyAsync(cohd, &(dp->ddcoh[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbsz*8*sizeof(double), cudaMemcpyHostToDevice,0);
  checkCudaError(err,__FILE__,__LINE__);
  /* correct offset for baselines */
  err=cudaMemcpyAsync(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbsz*2*sizeof(short), cudaMemcpyHostToDevice,0);
  checkCudaError(err,__FILE__,__LINE__);
  cudaDeviceSynchronize();
  /* xd <=x */
  err=cudaMemcpyAsync(xd, x, Nsz*sizeof(double), cudaMemcpyHostToDevice,0);
  checkCudaError(err,__FILE__,__LINE__);
  /* ### compute e=x - f(p) and its L2 norm */
  /* ### e=x-hx, p_eL2=||e|| */
  /* p: params (Mx1), x: data (Nx1), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations*/
  cudakernel_func(ThreadsPerBlock, (Nbase+ThreadsPerBlock-1)/ThreadsPerBlock, pd,hxd,M,N, cohd, bbd, Nbase, dp->M, dp->N);
  /* e=x */
  cbstatus=cublasDcopy(cbhandle, N, xd, 1, ed, 1);
  /* e=x-hx */
  double alpha=-1.0;
  cbstatus=cublasDaxpy(cbhandle, N, &alpha, hxd, 1, ed, 1);
  /* norm ||e|| */
  cbstatus=cublasDnrm2(cbhandle, N, ed, 1, &p_eL2);
  /* square */
  p_eL2=p_eL2*p_eL2;
  init_p_eL2=p_eL2;
  if(!isfinite(p_eL2)) stop=7;
  /* setup OS subsets and stating offsets */
  /* ed : N, cohd : Nbase*8, bbd : Nbase*2 full size */
  /* if ntiles<Nsubsets, make Nsubsets=ntiles */
  int Nsubsets=10; 
  if (ntiles<Nsubsets) { Nsubsets=ntiles; }
  /* at least one subset, otherwise the per-subset sizes below divide by zero */
  if (Nsubsets<1) { Nsubsets=1; }
  /* FIXME: is 0.1 enough ? */
  int max_os_iter=(int)ceil(0.1*(double)Nsubsets);
  int Npersubset=(N+Nsubsets-1)/Nsubsets;
  int Nbasepersubset=(Nbase+Nsubsets-1)/Nsubsets;
  int *Nos,*Nbaseos,*edI,*NbI,*subI=0;
  if ((Nos=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
      exit(1);
  }
  if ((Nbaseos=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
      exit(1);
  }
  if ((edI=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
      exit(1);
  }
  if ((NbI=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
      exit(1);
  }
  int l,ositer;
  /* edI/NbI: starting offset of each subset in the data / baseline arrays,
     Nos/Nbaseos: number of data points / baselines in each subset */
  k=l=0;
  for (ci=0; ci<Nsubsets; ci++) {
    edI[ci]=k;
    NbI[ci]=l;
    if (k+Npersubset<N) {
      Nos[ci]=Npersubset;
      Nbaseos[ci]=Nbasepersubset;
    } else {
      Nos[ci]=N-k;
      Nbaseos[ci]=Nbase-l;
    }
    k=k+Npersubset;
    l=l+Nbasepersubset;
  }
 #ifdef DEBUG
  for (ci=0; ci<Nsubsets; ci++) {
   printf("ci=%d, Nos=%d, edI=%d, Nbseos=%d, NbI=%d\n",ci,Nos[ci],edI[ci],Nbaseos[ci],NbI[ci]);
  }
 #endif
  /**** iteration loop ***********/
  for(k=0; k<itmax && !stop; ++k){
 #ifdef DEBUG
    printf("iter=%d err=%lf\n",k,p_eL2);
 #endif
    if(p_eL2<=eps3){ /* error is small */
      stop=6;
      break;
    }
    if (randomize) {
     /* random permutation of subsets */
     subI=random_permutation(Nsubsets,0,0);
    }
 /**************** OS loop ***************************/
    /* once a termination reason has been recorded no further subset is processed */
    for (ositer=0; ositer<max_os_iter && !stop; ositer++) {
     /* select subset to compute Jacobian */
     if (randomize) {
      l=subI[ositer];
     } else {
      l=(k+ositer)%Nsubsets;
     }
     /* NOTE: no. of subsets >= no. of OS iterations, so select
        a random set of subsets */
     /* N, Nbase changes with subset, cohd,bbd,ed gets offsets */
     /* ed : N, cohd : Nbase*8, bbd : Nbase*2 full size */
    /* p: params (Mx1), jacd: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations*/
    /* FIXME thread/block sizes 16x16=256, so 16 is chosen */
     //cudakernel_jacf(ThreadsPerBlock, ThreadsPerBlock/4, pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
     cudakernel_jacf(ThreadsPerBlock, ThreadsPerBlock/4, pd, jacd, M, Nos[l], &cohd[8*NbI[l]], &bbd[2*NbI[l]], Nbaseos[l], dp->M, dp->N);
     /* Compute J^T J and J^T e */
     /* Cache efficient computation of J^T J based on blocking
     */
     /* since J is in ROW major order, assume it is transposed,
       so actually calculate A=J*J^T, where J is size MxN */
     //status=culaDeviceDgemm('N','T',M,M,Nos[l],1.0,jacd,M,jacd,M,0.0,jacTjacd,M);
     //checkStatus(status,__FILE__,__LINE__);
     double cone=1.0; double czero=0.0;
     cbstatus=cublasDgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_T,M,M,Nos[l],&cone,jacd,M,jacd,M,&czero,jacTjacd,M);
     /* create backup */
     /* copy jacTjacd0<=jacTjacd */
     cbstatus=cublasDcopy(cbhandle, M*M, jacTjacd, 1, jacTjacd0, 1);
     /* J^T e */
     /* calculate b=J^T*e (actually compute b=J*e, where J in row major (size MxN) */
     //status=culaDeviceDgemv('N',M,Nos[l],1.0,jacd,M,&ed[edI[l]],1,0.0,jacTed,1);
     //checkStatus(status,__FILE__,__LINE__);
     cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_N,M,Nos[l],&cone,jacd,M,&ed[edI[l]],1,&czero,jacTed,1);
     /* Compute ||J^T e||_inf and ||p||^2 */
     /* find infinity norm of J^T e, 1 based indexing*/
     cbstatus=cublasIdamax(cbhandle, M, jacTed, 1, &ci);
     err=cudaMemcpy(&jacTe_inf,&(jacTed[ci-1]),sizeof(double),cudaMemcpyDeviceToHost);
     checkCudaError(err,__FILE__,__LINE__);
     /* L2 norm of current parameter values */
     /* norm ||p|| */
     cbstatus=cublasDnrm2(cbhandle, M, pd, 1, &p_L2);
     p_L2=p_L2*p_L2;
     if(jacTe_inf<0.0) {jacTe_inf=-jacTe_inf;}
 #ifdef DEBUG
     printf("Inf norm=%lf\n",jacTe_inf);
 #endif
    /* check for convergence */
    if((jacTe_inf <= eps1)){
      Dp_L2=0.0; /* no increment for p in this case */
      stop=1;
      break;
    }
    /* compute initial (k=0) damping factor */
    if (k==0) {
      /* find max diagonal element (stride is M+1) */
      /* should be MAX not MAX(ABS) */
      cbstatus=cublasIdamax(cbhandle, M, jacTjacd, M+1, &ci); /* 1 based index */
      ci=(ci-1)*(M+1); /* right value of the diagonal */
      err=cudaMemcpy(&tmp,&(jacTjacd[ci]),sizeof(double),cudaMemcpyDeviceToHost);
      checkCudaError(err,__FILE__,__LINE__);
      mu=tau*tmp;
    }
    /* determine increment using adaptive damping */
    while(1){
      /* augment normal equations */
      /* increment A => A+ mu*I, increment diagonal entries */
      /* copy jacTjacd<=jacTjacd0 */
      cbstatus=cublasDcopy(cbhandle, M*M, jacTjacd0, 1, jacTjacd, 1);
      cudakernel_diagmu(ThreadsPerBlock, BlocksPerGrid, M, jacTjacd, mu);
 #ifdef DEBUG
      printf("mu=%lf\n",mu);
 #endif
 /*************************************************************************/
      issolved=0;
      /* solve augmented equations A x = b */
      /* A==jacTjacd, b==Dpd, after solving, x==Dpd */
      /* b=jacTed : intially right hand side, at exit the solution */
      if (solve_axb==0) {
        /* Cholesky solver **********************/
        /* lower triangle of Ad is destroyed */
        //status=culaDeviceDpotrf('U',M,jacTjacd,M);
        cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_UPPER, M, jacTjacd, M, work, work_size, devInfo);
        cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
        if (!devInfo_h) {
         issolved=1;
        } else {
         issolved=0;
 #ifdef DEBUG
         fprintf(stderr,"Singular matrix\n");
 #endif
        }
        if (issolved) {
         /* copy Dpd<=jacTed */
         cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
 #ifdef DEBUG
         checkCublasError(cbstatus,__FILE__,__LINE__);
 #endif
         //status=culaDeviceDpotrs('U',M,1,jacTjacd,M,Dpd,M);
         cusolverDnDpotrs(solver_handle, CUBLAS_FILL_MODE_UPPER,M,1,jacTjacd,M,Dpd,M,devInfo);
         cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
         if (devInfo_h) {
           issolved=0;
 #ifdef DEBUG
           fprintf(stderr,"Singular matrix\n");
 #endif
         }
        }
      } else if (solve_axb==1) {
        /* QR solver ********************************/
        //status=culaDeviceDgeqrf(M,M,jacTjacd,M,taud);
        cusolverDnDgeqrf(solver_handle, M, M, jacTjacd, M, taud, work, work_size, devInfo);
        cudaDeviceSynchronize();
        cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
        if (!devInfo_h) {
         issolved=1;
        } else {
         issolved=0;
 #ifdef DEBUG
         fprintf(stderr,"Singular matrix\n");
 #endif
        }
        if (issolved) {
         /* copy Dpd<=jacTed */
         cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
         //status=culaDeviceDgeqrs(M,M,1,jacTjacd,M,taud,Dpd,M);
         cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, M, 1, M, jacTjacd, M, taud, Dpd, M, work, work_size, devInfo);
         cudaDeviceSynchronize();
         cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
         if (devInfo_h) {
           issolved=0;
 #ifdef DEBUG
           fprintf(stderr,"Singular matrix\n");
 #endif
         } else {
          /* back substitution with the upper triangular factor */
          cone=1.0;
          cbstatus=cublasDtrsm(cbhandle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,M,1,&cone,jacTjacd,M,Dpd,M);
         }
        }
      } else {
        /* SVD solver *********************************/
        /* U S VT = A */
        //status=culaDeviceDgesvd('A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M);
        //checkStatus(status,__FILE__,__LINE__);
        cusolverDnDgesvd(solver_handle,'A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M,work,work_size,rwork,devInfo);
        cudaDeviceSynchronize();
        /* copy Dpd<=jacTed */
        cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
        /* b<=U^T * b */
        //status=culaDeviceDgemv('T',M,M,1.0,Ud,M,Dpd,1,0.0,Dpd,1);
        //checkStatus(status,__FILE__,__LINE__);
        /* NOTE(review): in both gemv calls of this branch Dpd is passed as the
           input and the output vector; confirm that cuBLAS gemv tolerates this
           aliasing, otherwise a separate output buffer is needed */
        cone=1.0; czero=0.0;
        cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_T,M,M,&cone,Ud,M,Dpd,1,&czero,Dpd,1);
        /* divide by singular values  Dpd[]/Sd[]  for Sd[]> eps1 */
        cudakernel_diagdiv(ThreadsPerBlock, BlocksPerGrid, M, eps1, Dpd, Sd);
        /* b<=VT^T * b */
        //status=culaDeviceDgemv('T',M,M,1.0,VTd,M,Dpd,1,0.0,Dpd,1);
        //checkStatus(status,__FILE__,__LINE__);
        cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_T,M,M,&cone,VTd,M,Dpd,1,&czero,Dpd,1);
        issolved=1;
      }
 /*************************************************************************/
      /* compute p's new estimate and ||Dp||^2 */
      if (issolved) {
          /* compute p's new estimate and ||Dp||^2 */
          /* pnew=p+Dp */
          /* pnew=p */
          cbstatus=cublasDcopy(cbhandle, M, pd, 1, pnewd, 1);
          /* pnew=pnew+Dp */
          alpha=1.0;
          cbstatus=cublasDaxpy(cbhandle, M, &alpha, Dpd, 1, pnewd, 1);
          /* norm ||Dp|| */
          cbstatus=cublasDnrm2(cbhandle, M, Dpd, 1, &Dp_L2);
          Dp_L2=Dp_L2*Dp_L2;
 #ifdef DEBUG
 printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
 #endif
          if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
           stop=2;
           break;
          }
         if(Dp_L2>=(p_L2+eps2)/(CLM_EPSILON*CLM_EPSILON)){ /* almost singular */
          stop=4;
          break;
         }
        /* new function value */
        /* compute ||e(pDp)||_2 */
        /* ### hx=x-hx, pDp_eL2=||hx|| */
        /* copy to device */
        /* hxd<=hx */
        cudakernel_func(ThreadsPerBlock, (Nbase+ThreadsPerBlock-1)/ThreadsPerBlock, pnewd, hxd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
        /* e=x */
        cbstatus=cublasDcopy(cbhandle, N, xd, 1, ed, 1);
        /* e=x-hx */
        alpha=-1.0;
        cbstatus=cublasDaxpy(cbhandle, N, &alpha, hxd, 1, ed, 1);
        /* note: e is updated */
        /* norm ||e|| */
        cbstatus=cublasDnrm2(cbhandle, N, ed, 1, &pDp_eL2);
        pDp_eL2=pDp_eL2*pDp_eL2;
        if(!isfinite(pDp_eL2)){ /* sum of squares is not finite, most probably due to a user error.
                                  */
          stop=7;
          break;
        }
        /* dL=Dp'*(mu*Dp+jacTe) */
        /* bd=jacTe+mu*Dp */
        cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, bd, 1);
        cbstatus=cublasDaxpy(cbhandle, M, &mu, Dpd, 1, bd, 1);
        cbstatus=cublasDdot(cbhandle, M, Dpd, 1, bd, 1, &dL);
        dF=p_eL2-pDp_eL2;
 #ifdef DEBUG
        printf("dF=%lf, dL=%lf\n",dF,dL);
 #endif
        if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
          tmp=(2.0*dF/dL-1.0);
          tmp=1.0-tmp*tmp*tmp;
          mu=mu*((tmp>=CLM_ONE_THIRD)? tmp : CLM_ONE_THIRD);
          nu=2;
          /* update p's estimate */
          cbstatus=cublasDcopy(cbhandle, M, pnewd, 1, pd, 1);
          /* update ||e||_2 */
          p_eL2=pDp_eL2;
          break;
        }
      }
      /* if this point is reached, either the linear system could not be solved or
       * the error did not reduce; in any case, the increment must be rejected
       */
      mu*=(double)nu;
      nu2=nu<<1; // 2*nu;
      if(nu2<=nu){ /* nu has wrapped around (overflown). */
        stop=5;
        break;
      }
      nu=nu2;
    } /* inner loop */
  }
  if (randomize) {
   free(subI);
  }
 /**************** end OS loop ***************************/
  }
  /**** end iteration loop ***********/
  free(Nos);
  free(Nbaseos);
  free(edI);
  free(NbI);
  if(k>=itmax) stop=3;
  /* copy back current solution */
  err=cudaMemcpyAsync(p,pd,Msz*sizeof(double),cudaMemcpyDeviceToHost,0);
  checkCudaError(err,__FILE__,__LINE__);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  /* synchronize async operations */
  cudaDeviceSynchronize();
  /* buffers carved out of gWORK belong to the caller and are not freed here */
  if (!gWORK) {
  cudaFree(xd);
  cudaFree(jacd);
  cudaFree(jacTjacd);
  cudaFree(jacTjacd0);
  cudaFree(jacTed);
  cudaFree(Dpd);
  cudaFree(bd);
  cudaFree(pd);
  cudaFree(pnewd);
  cudaFree(hxd);
  cudaFree(ed);
  if (solve_axb==1) {
   cudaFree(taud);
  } else if (solve_axb==2) {
   cudaFree(Ud);
   cudaFree(VTd);
   cudaFree(Sd);
  }
  cudaFree(cohd);
  cudaFree(bbd);
  }
  /* cusolver scratch space is always allocated locally */
  cudaFree(devInfo);
  cudaFree(work);
  if (solve_axb==2) {
    cudaFree(rwork);
  }
 #ifdef DEBUG
  printf("stop=%d\n",stop);
 #endif
  if(info){
    info[0]=init_p_eL2;
    info[1]=p_eL2;
    info[2]=jacTe_inf;
    info[3]=Dp_L2;
    info[4]=mu;
    info[5]=(double)k;
    info[6]=(double)stop;
    info[7]=(double)0;
    info[8]=(double)0;
    info[9]=(double)0;
  }
  return 0;
 }
--- a/src/lib/Solvers/oslmfit.o
+++ b/src/lib/Solvers/oslmfit.o
--- a/src/lib/Solvers/residual.c
+++ b/src/lib/Solvers/residual.c
--- a/src/lib/Solvers/residual.o
+++ b/src/lib/Solvers/residual.o
--- a/src/lib/Solvers/robust.cu
+++ b/src/lib/Solvers/robust.cu
@ -1,721 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "cuda.h"
 #include <cuComplex.h>
 #include <stdio.h>
 /* enable this for kernel failure detection */
 //#define CUDA_DBG
 /* Gradient of the robust cost, one thread per parameter.
    Launch: 1D grid of 1D blocks; thread n (global index) handles parameter
    np=n+goff and writes grad[n]. Threads with n>=Nparam do nothing, no shared
    memory is used.
    Nbase     : number of baselines (rows of bb, x, coh) that are scanned
    tilesz    : number of tiles, used to map a baseline to its chunk
    M         : number of clusters
    Ns        : number of stations
    Nparam    : number of parameters handled by this launch
    goff      : offset added to the thread index to get the parameter number
    robust_nu : nu in the weight 1/(nu+residual^2)
    x         : residual, 8 values per baseline
    coh       : coherencies, 8*M values per baseline
    p         : parameters
    bb        : station pair (sta1,sta2) per baseline, flagged ones hold -1
    ptoclus   : per cluster pair (nchunk, start index in p)
    grad      : output, one value per thread/parameter */
 __global__ void kernel_deriv_robust(int Nbase, int tilesz, int M, int Ns, int Nparam, int goff, double robust_nu, double *x, double *coh, double *p, short *bb, int *ptoclus, double *grad){
  /* global thread index */
  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
  if (n>=Nparam) {
    return;
  }
  /* parameter number of this thread */
  unsigned int np=n+goff;
  /* this thread works on 
    x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
    bb[2*n:2*n+1] (sta1,sta2)
    organization of p (N stations and M clusters)
             sta 0          sta 1           sta 2        ....  sta N-1 
  clus 0   0...7            8...15          16...23      ...   8N-8     8N-1
  clus 1   8N..8N+7         8N+8..8N+15     8N+16..8N+23 ....  8N+8N-8...8N+8N-1
  ......
  clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15....  ...(M-1)N+8N-8 (M-1)N+8N-1
    organization of coherencies (coh)
        [0, 8*M-1] : baseline 0
        [8*M, 8*M+8*M-1]: baseline 1
        [n*8*M, n*8*M+8*M-1]: baseline n
        ......
        [n*8*M+cm*8, n*8*M+cm*8+7]  cluster cm, baseline n
    this thread calculates the derivative for parameter np and stores it in grad[n]
  */ 
  /* find which cluster this parameter belongs to */
  /* ptoclus[0,1]  are nchunk, and p[start index] for each cluster */
  /* np should be within ptoclus[2*cli+1]....ptoclus[2*cli+1]+ptoclus[2*cli]*8*Ns-1 */
  int cli;
  for (cli=0; cli<M; cli++) {
    if (np>=ptoclus[2*cli+1] && np<=ptoclus[2*cli+1]+ptoclus[2*cli]*8*Ns-1) {
      break;
    }
  }
  /* now either cli>=M: cluster not found (also when M==0), gradient is zero,
     or cli<M and cli is the right cluster.
     No second test of the last cluster is needed: the search above only runs
     past cluster M-1 when np is outside its range */
  if (cli>=M) {
    grad[n]=0.0;
    return;
  }
  int pstart=ptoclus[2*cli+1];
  int nchunk=ptoclus[2*cli];
  /* find station and which parameter for this thread (parameter) */
  /* this cluster has parameters ptoclus[2*cli+1] ..... +ptoclus[2*cli]*8*Ns-1 */
  unsigned int np_s=(np-pstart)%(8*Ns);
  unsigned int stc=np_s/8; /* this is the station of this param */
  /* which parameter 0..7 */
  unsigned int stoff=np_s-stc*8;
  /* which cluster 0..M-1 */
  unsigned int stm=cli;
  /* which chunk does this parameter belong to */
  unsigned int tpchunk=(np-pstart)/(8*Ns);
  int tilesperchunk=(tilesz+nchunk-1)/nchunk;
  /* total baselines in one tile */
  int Nbase0=(Ns-1)*Ns/2;
  /* start of the parameters of this cluster and chunk in p */
  unsigned int poff=pstart+tpchunk*8*Ns;
  /* derivative of the 2x2 gain of station stc w.r.t. this parameter: all zero
     except a one at position stoff. Same for every baseline, so built once */
  double pp[8];
  for (int ci=0; ci<8; ci++) {
    pp[ci]=0.0;
  }
  pp[stoff]=1.0;
  double gsum=0.0;
  for(unsigned int nb=0; nb<Nbase; nb++) {
    /* which tile is this ? */
    int ttile=nb/Nbase0;
    /* which chunk this tile belongs to */
    int tptile=ttile/tilesperchunk;
    /* now tptile has to match tpchunk, otherwise ignore calculation */
    if (tptile!=tpchunk) {
      continue;
    }
    int sta1=(int)bb[2*nb];
    int sta2=(int)bb[2*nb+1];
    /* flagged baselines will have sta1==sta2==-1 */
    if (sta1<0 || sta2<0) {
      continue;
    }
    /* only calculate deriv if baseline corresponds to this station */
    if (stc!=sta1 && stc!=sta2) {
      continue;
    }
    /* read residual vector, real,imag separate*/
    double xr[8];
    for (int ci=0; ci<8; ci++) {
      xr[ci]=x[nb*8+ci];
    }
    /* read in coherency */
    const double *cb=&coh[8*nb*M+8*stm];
    cuDoubleComplex C[4];
    for (int ci=0; ci<4; ci++) {
      C[ci]=make_cuDoubleComplex(cb[2*ci],cb[2*ci+1]);
    }
    cuDoubleComplex G1[4];
    cuDoubleComplex G2[4];
    if(stc==sta1) {
      /* this parameter belongs to station 1: G1 is the unit derivative */
      for (int ci=0; ci<4; ci++) {
        G1[ci]=make_cuDoubleComplex(pp[2*ci],pp[2*ci+1]);
      }
      /* G2: gain of station 2, conjugated and transposed */
      const double *g2=&p[poff+sta2*8];
      G2[0]=make_cuDoubleComplex(g2[0],-g2[1]);
      G2[2]=make_cuDoubleComplex(g2[2],-g2[3]);
      G2[1]=make_cuDoubleComplex(g2[4],-g2[5]);
      G2[3]=make_cuDoubleComplex(g2[6],-g2[7]);
    } else {
      /* here stc==sta2: G2 is the unit derivative, conjugated and transposed */
      G2[0]=make_cuDoubleComplex(pp[0],-pp[1]);
      G2[2]=make_cuDoubleComplex(pp[2],-pp[3]);
      G2[1]=make_cuDoubleComplex(pp[4],-pp[5]);
      G2[3]=make_cuDoubleComplex(pp[6],-pp[7]);
      /* G1: gain of station 1, taken as is */
      const double *g1=&p[poff+sta1*8];
      for (int ci=0; ci<4; ci++) {
        G1[ci]=make_cuDoubleComplex(g1[2*ci],g1[2*ci+1]);
      }
    }
    cuDoubleComplex T1[4];
    /* T1=G1*C */
    T1[0]=cuCadd(cuCmul(G1[0],C[0]),cuCmul(G1[1],C[2]));
    T1[1]=cuCadd(cuCmul(G1[0],C[1]),cuCmul(G1[1],C[3]));
    T1[2]=cuCadd(cuCmul(G1[2],C[0]),cuCmul(G1[3],C[2]));
    T1[3]=cuCadd(cuCmul(G1[2],C[1]),cuCmul(G1[3],C[3]));
    cuDoubleComplex T2[4];
    /* T2=T1*G2 , G2 conjugate transposed */
    T2[0]=cuCadd(cuCmul(T1[0],G2[0]),cuCmul(T1[1],G2[2]));
    T2[1]=cuCadd(cuCmul(T1[0],G2[1]),cuCmul(T1[1],G2[3]));
    T2[2]=cuCadd(cuCmul(T1[2],G2[0]),cuCmul(T1[3],G2[2]));
    T2[3]=cuCadd(cuCmul(T1[2],G2[1]),cuCmul(T1[3],G2[3]));
    /* calculate product xr*vec(J_p C J_q^H )/(nu+residual^2) */
    double dsum=0.0;
    for (int ci=0; ci<4; ci++) {
      dsum+=xr[2*ci]*T2[ci].x/(robust_nu+xr[2*ci]*xr[2*ci]);
      dsum+=xr[2*ci+1]*T2[ci].y/(robust_nu+xr[2*ci+1]*xr[2*ci+1]);
    }
    /* accumulate sum NOTE
       its important to get the sign right,
       depending on res=data-model or res=model-data  */
    gsum+=2.0*dsum;
  }
  grad[n]=gsum;
 }
 /* Weighted model evaluation: x = wt .* vec(J_sta1 * C * J_sta2^H), per baseline.
    Launch layout: 1-D grid, one thread per baseline; threads whose index is
    >= Nbase do nothing.
    Nbase : number of baselines
    x     : output, 8 values per baseline (re,im of the four 2x2 product entries)
    coh   : coherencies, 8 values per baseline (no cluster offset is applied here)
    p     : parameters, 8 values per station (no cluster offset is applied here)
    bb    : station pair of baseline n at bb[2n], bb[2n+1]; a negative entry
            marks the baseline as flagged
    wt    : weights, 8 values per baseline
    N     : not referenced inside the kernel
    Flagged baselines are skipped, so their entries of x are left untouched. */
 __global__ void kernel_func_wt(int Nbase, double *x, double *coh, double *p, short *bb, double *wt, int N){
   const unsigned int bl = threadIdx.x + blockDim.x*blockIdx.x;
   if (bl>=Nbase) {
     return;
   }
   const int sta1=(int)bb[2*bl];
   const int sta2=(int)bb[2*bl+1];
   if (sta1<0 || sta2<0) {
     /* flagged baseline */
     return;
   }
   const double *g1=&p[sta1*8];
   const double *g2=&p[sta2*8];
   /* J1: Jones matrix of station 1, C: coherency of this baseline */
   cuDoubleComplex J1[4];
   cuDoubleComplex C[4];
   for (int k=0; k<4; k++) {
     J1[k]=make_cuDoubleComplex(g1[2*k],g1[2*k+1]);
     C[k]=make_cuDoubleComplex(coh[8*bl+2*k],coh[8*bl+2*k+1]);
   }
   /* A = J1*C */
   cuDoubleComplex A[4];
   A[0]=cuCadd(cuCmul(J1[0],C[0]),cuCmul(J1[1],C[2]));
   A[1]=cuCadd(cuCmul(J1[0],C[1]),cuCmul(J1[1],C[3]));
   A[2]=cuCadd(cuCmul(J1[2],C[0]),cuCmul(J1[3],C[2]));
   A[3]=cuCadd(cuCmul(J1[2],C[1]),cuCmul(J1[3],C[3]));
   /* J2H: conjugate transpose of the Jones matrix of station 2
      (imaginary parts negated, off-diagonal entries swapped) */
   cuDoubleComplex J2H[4];
   J2H[0]=make_cuDoubleComplex(g2[0],-g2[1]);
   J2H[2]=make_cuDoubleComplex(g2[2],-g2[3]);
   J2H[1]=make_cuDoubleComplex(g2[4],-g2[5]);
   J2H[3]=make_cuDoubleComplex(g2[6],-g2[7]);
   /* B = A*J2H */
   cuDoubleComplex B[4];
   B[0]=cuCadd(cuCmul(A[0],J2H[0]),cuCmul(A[1],J2H[2]));
   B[1]=cuCadd(cuCmul(A[0],J2H[1]),cuCmul(A[1],J2H[3]));
   B[2]=cuCadd(cuCmul(A[2],J2H[0]),cuCmul(A[3],J2H[2]));
   B[3]=cuCadd(cuCmul(A[2],J2H[1]),cuCmul(A[3],J2H[3]));
   /* store the weighted model, interleaving real and imaginary parts */
   for (int k=0; k<4; k++) {
     x[8*bl+2*k]=wt[8*bl+2*k]*B[k].x;
     x[8*bl+2*k+1]=wt[8*bl+2*k+1]*B[k].y;
   }
 }
 /* Weighted Jacobian of vec(J_sta1 * C * J_sta2^H) with respect to the parameters.
    Launch layout: 2-D grid; x index = baseline n, y index = parameter m.
    Threads with n >= Nbase or m >= M do nothing.
    Nbase : number of baselines
    M     : number of parameters (columns of jac), 8 per station
    jac   : output, row-major with M columns; baseline n owns rows 8n..8n+7
    coh   : coherencies, 8 values per baseline
    p     : parameters, 8 values per station
    bb    : station pair of baseline n at bb[2n], bb[2n+1]; negative = flagged
    wt    : row weights, 8 values per baseline
    N     : not referenced inside the kernel
    Only entries where parameter m belongs to sta1 or sta2 of an unflagged
    baseline are written; every other entry of jac is left untouched.
    Offsets into jac are formed in size_t so they do not wrap around 32 bits
    when 8*Nbase*M is large. */
 __global__ void kernel_jacf_wt(int Nbase, int M, double *jac, double *coh, double *p, short *bb, double *wt, int N){
   /* global thread index : equal to the baseline */
   unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
   /* which parameter:0...M */
   unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
   if(n<Nbase && m<M) {
     int sta1=(int)bb[2*n];
     int sta2=(int)bb[2*n+1];
     /* station that owns parameter m (8 parameters per station) */
     int stc=m>>3;
     if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {
       /* first of the 8 rows (and of the 8 coh/wt values) of this baseline */
       const size_t row0=(size_t)8*(size_t)n;
       cuDoubleComplex C[4];
       for (int k=0; k<4; k++) {
         C[k].x=coh[row0+2*k];
         C[k].y=coh[row0+2*k+1];
       }
       /* which of the 8 values of that station: 0..7 */
       int stoff=m-stc*8;
       double pp1[8];
       double pp2[8];
       /* The station owning m is replaced by a unit vector selecting the
          differentiated value; the other station keeps its parameters. */
       if (stc==sta1) {
         for (int cn=0; cn<8; cn++) {
           pp1[cn]=0.0;
           pp2[cn]=p[sta2*8+cn];
         }
         pp1[stoff]=1.0;
       } else {
         /* here stc==sta2, guaranteed by the enclosing condition */
         for (int cn=0; cn<8; cn++) {
           pp2[cn]=0.0;
           pp1[cn]=p[sta1*8+cn];
         }
         pp2[stoff]=1.0;
       }
       cuDoubleComplex G1[4];
       for (int k=0; k<4; k++) {
         G1[k].x=pp1[2*k];
         G1[k].y=pp1[2*k+1];
       }
       cuDoubleComplex T1[4];
       /* T1=G1*C */
       T1[0]=cuCadd(cuCmul(G1[0],C[0]),cuCmul(G1[1],C[2]));
       T1[1]=cuCadd(cuCmul(G1[0],C[1]),cuCmul(G1[1],C[3]));
       T1[2]=cuCadd(cuCmul(G1[2],C[0]),cuCmul(G1[3],C[2]));
       T1[3]=cuCadd(cuCmul(G1[2],C[1]),cuCmul(G1[3],C[3]));
       cuDoubleComplex G2[4];
       /* conjugate transpose: negate imaginary parts, swap off-diagonals */
       G2[0].x=pp2[0];
       G2[0].y=-pp2[1];
       G2[2].x=pp2[2];
       G2[2].y=-pp2[3];
       G2[1].x=pp2[4];
       G2[1].y=-pp2[5];
       G2[3].x=pp2[6];
       G2[3].y=-pp2[7];
       cuDoubleComplex T2[4];
       /* T2=T1*G2 */
       T2[0]=cuCadd(cuCmul(T1[0],G2[0]),cuCmul(T1[1],G2[2]));
       T2[1]=cuCadd(cuCmul(T1[0],G2[1]),cuCmul(T1[1],G2[3]));
       T2[2]=cuCadd(cuCmul(T1[2],G2[0]),cuCmul(T1[3],G2[2]));
       T2[3]=cuCadd(cuCmul(T1[2],G2[1]),cuCmul(T1[3],G2[3]));
       /* update jacobian column m, with row weights (row major order) */
       const double dv[8]={T2[0].x,T2[0].y,T2[1].x,T2[1].y,T2[2].x,T2[2].y,T2[3].x,T2[3].y};
       for (int k=0; k<8; k++) {
         const size_t row=row0+(size_t)k;
         jac[(size_t)m+(size_t)M*row]=wt[row]*dv[k];
       }
     }
   }
 }
 /* Fill wt[0..N-1] with the constant alpha.
    Launch layout: 1-D grid, one thread per element; surplus threads exit. */
 __global__ void kernel_setweights(int N, double *wt, double alpha){
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=N) {
     return;
   }
   wt[idx]=alpha;
 }
 /* Element-wise product in place: x[i] <= x[i]*wt[i] for i in 0..N-1.
    Launch layout: 1-D grid, one thread per element; surplus threads exit. */
 __global__ void kernel_hadamard(int N, double *wt, double *x){
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=N) {
     return;
   }
   x[idx]*=wt[idx];
 }
 /* For i in 0..N-1: wt[i] <= (nu+1)/(nu+x[i]^2) and q[i] <= wt[i]-log(wt[i]).
    Launch layout: 1-D grid, one thread per element; surplus threads exit. */
 __global__ void kernel_updateweights(int N, double *wt, double *x, double *q, double nu){
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=N) {
     return;
   }
   wt[idx]=((nu+1.0)/(nu+x[idx]*x[idx]));
   /* read the weight back from wt, exactly as stored */
   const double w=wt[idx];
   q[idx]=w-log(w); /* so that its +ve */
 }
 /* Replace every weight by its square root: wt[i] <= sqrt(wt[i]), i in 0..N-1.
    Launch layout: 1-D grid, one thread per element; surplus threads exit. */
 __global__ void kernel_sqrtweights(int N, double *wt){
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=N) {
     return;
   }
   wt[idx]=sqrt(wt[idx]);
 }
 /* Device helper approximating the digamma function psi(x).
    While x < 7 the term 1/x is subtracted and x is raised by one; the
    remainder is evaluated with a series in 1/(x-0.5). */
 __device__ double
 digamma(double x) {
   double acc = 0.0;
   /* shift the argument upwards until x >= 7 */
   while (x < 7.0) {
     acc -= 1.0/x;
     ++x;
   }
   x -= 1.0/2.0;
   const double r = 1.0/x;
   const double r2 = r*r;
   const double r4 = r2*r2;
   acc += log(x)+(1./24.)*r2-(7.0/960.0)*r4+(31.0/8064.0)*r4*r2-(127.0/30720.0)*r4*r4;
   return acc;
 }
 /* Evaluate, for the candidate nu_i = nulow + i*deltanu (i in 0..Nd-1),
      q[i] = psi((nu_i+1)/2) - log((nu_i+1)/2) - psi(nu_i/2) + log(nu_i/2) - qsum + 1
    Launch layout: 1-D grid, one thread per candidate; surplus threads exit. */
 __global__ void kernel_evaluatenu(int Nd, double qsum, double *q, double deltanu,double nulow) {
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=Nd) {
     return;
   }
   const double nu=(nulow+((double)idx)*deltanu);
   const double psi_hi=digamma(nu*0.5+0.5);
   double acc=psi_hi-log((nu+1.0)*0.5); /* psi((nu+1)/2)-log((nu+1)/2) */
   const double psi_lo=digamma(nu*0.5);
   acc+=-psi_lo+log((nu)*0.5); /* -psi((nu)/2)+log((nu)/2) */
   acc+=-qsum+1.0; /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
   q[idx]=acc;
 }
 /* only use extern if calling code is C */
 extern "C"
 {
 /* Host launcher: set all N entries of the device array wt to alpha
    (kernel_setweights), then wait for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_setweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double alpha) {
   kernel_setweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, alpha);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: element-wise product x <= x*wt over N device elements
    (kernel_hadamard), then wait for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_hadamard(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double *x) {
   kernel_hadamard<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: recompute the N weights wt from the values in x and fill q
    (kernel_updateweights, with nu = robust_nu), then wait for the device.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_updateweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double *x, double *q, double robust_nu) {
   kernel_updateweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x, q, robust_nu);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: replace each of the N device weights by its square root
    (kernel_sqrtweights), then wait for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_sqrtweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt) {
   kernel_sqrtweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: evaluate the expression used to find the optimum nu on a
    grid of Nd candidates nulow + i*deltanu, writing the results to the device
    array q (kernel_evaluatenu), then wait for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_evaluatenu(int ThreadsPerBlock, int BlocksPerGrid, int Nd, double qsum, double *q, double deltanu,double nulow) {
   kernel_evaluatenu<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu, nulow);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher for wt \odot f(): the weighted model vector.
    p: params (Mx1), x: output (Nx1), coh: coherencies, bbh: baseline->station
    mapping, wt: weights, Nbase: number of baselines.
    x is zeroed first, then kernel_func_wt fills the unflagged baselines.
    Nstations is forwarded as the kernel's last argument; M and Mclus are not
    used in this function.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_func_wt(int ThreadsPerBlock, int BlocksPerGrid, double *p, double *x, int M, int N, double *coh, short *bbh, double *wt, int Nbase, int Mclus, int Nstations) {
   /* clear the output so skipped baselines read as zero */
   cudaMemset(x, 0, N*sizeof(double));
   kernel_func_wt<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase,  x, coh, p, bbh, wt, Nstations);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher for wt \odot jacf(): the weighted Jacobian.
    p: params (Mx1), jac: jacobian (NxM, row major), coh: coherencies,
    bbh: baseline->station mapping, wt: row weights, Nbase: number of baselines.
    jac is zeroed first, then kernel_jacf_wt fills the non-zero entries.
    The launch uses a fixed 16x8 thread block; ThreadsPerBlock_row,
    ThreadsPerBlock_col, Mclus and clus are not used in this function, and
    Nstations is forwarded as the kernel's last argument.
    The byte count for the memset is formed in size_t: N*M evaluated as int
    overflows once the Jacobian has 2^31 or more entries.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_jacf_wt(int ThreadsPerBlock_row, int  ThreadsPerBlock_col, double *p, double *jac, int M, int N, double *coh, short *bbh, double *wt, int Nbase, int Mclus, int Nstations, int clus) {
 #ifdef CUDA_DBG
   cudaError_t error;
 #endif
   /* NOTE: use small value for ThreadsPerBlock here, like 8 */
   dim3 threadsPerBlock(16, 8);
   /* grid: x covers the Nbase baselines, y covers the M parameters */
   dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x, 
                (M+threadsPerBlock.y-1)/threadsPerBlock.y);
   /* set memory of jac to zero; widen before multiplying to avoid int overflow */
   cudaMemset(jac, 0, (size_t)N*(size_t)M*sizeof(double));
   kernel_jacf_wt<<< numBlocks, threadsPerBlock>>>(Nbase,  M, jac, coh, p, bbh, wt, Nstations);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   error = cudaGetLastError();
   if(error != cudaSuccess)
   {
     // print the CUDA error message and exit
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher for kernel_deriv_robust (gradient evaluation).
    ThreadsPerBlock: keep <= 128 ???
    BlocksPerGrid: chosen so that Threads*Blocks is approximately the number
                   of baselines
    Nbase: no of baselines (total, including tilesz >1)
    tilesz: tile size
    M: no of clusters
    Ns: no of stations
    Nparam: no of actual parameters  <=total 
    goff: starting point of gradient calculation 0..Nparams
    robust_nu: nu passed through to the kernel
    x: N*8 x 1 residual
    coh: N*8*M x 1
    p: M*Ns*8 x 1
    bb: 2*N x 1
    ptoclus: 2*M x 1
    grad: Nparamsx1 gradient values
    The launch requests ThreadsPerBlock*sizeof(double) bytes of dynamic shared
    memory per block and then waits for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void cudakernel_lbfgs_robust(int ThreadsPerBlock, int BlocksPerGrid, int Nbase, int tilesz, int M, int Ns, int Nparam, int goff, double robust_nu, double *x, double *coh, double *p, short *bb, int *ptoclus, double *grad){
   /* third launch argument: dynamic shared memory size in bytes */
   kernel_deriv_robust<<< BlocksPerGrid, ThreadsPerBlock, ThreadsPerBlock*sizeof(double) >>> (Nbase, tilesz, M, Ns, Nparam, goff, robust_nu, x, coh, p, bb, ptoclus, grad);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 }
--- a/src/lib/Solvers/robust.o
+++ b/src/lib/Solvers/robust.o
--- a/src/lib/Solvers/robust_fl.cu
+++ b/src/lib/Solvers/robust_fl.cu
@ -1,536 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "cuda.h"
 #include <cuComplex.h>
 #include <stdio.h>
 /* enable this for checking for kernel failure */
 //#define CUDA_DBG
 /* Single-precision weighted model evaluation:
    x = wt .* vec(J_sta1 * C * J_sta2^H), per baseline.
    Launch layout: 1-D grid, one thread per baseline; threads whose index is
    >= Nbase do nothing.
    Nbase : number of baselines
    x     : output, 8 values per baseline (re,im of the four 2x2 product entries)
    coh   : coherencies, 8 values per baseline (no cluster offset is applied here)
    p     : parameters, 8 values per station (no cluster offset is applied here)
    bb    : station pair of baseline n at bb[2n], bb[2n+1]; a negative entry
            marks the baseline as flagged
    wt    : weights, 8 values per baseline
    N     : not referenced inside the kernel
    Flagged baselines are skipped, so their entries of x are left untouched. */
 __global__ void 
 kernel_func_wt_fl(int Nbase, float *x, float *coh, float *p, short *bb, float *wt, int N){
   const unsigned int bl = threadIdx.x + blockDim.x*blockIdx.x;
   if (bl>=Nbase) {
     return;
   }
   const int sta1=(int)bb[2*bl];
   const int sta2=(int)bb[2*bl+1];
   if (sta1<0 || sta2<0) {
     /* flagged baseline */
     return;
   }
   const float *g1=&p[sta1*8];
   const float *g2=&p[sta2*8];
   /* J1: Jones matrix of station 1, C: coherency of this baseline */
   cuComplex J1[4];
   cuComplex C[4];
   for (int k=0; k<4; k++) {
     J1[k]=make_cuFloatComplex(g1[2*k],g1[2*k+1]);
     C[k]=make_cuFloatComplex(coh[8*bl+2*k],coh[8*bl+2*k+1]);
   }
   /* A = J1*C */
   cuComplex A[4];
   A[0]=cuCaddf(cuCmulf(J1[0],C[0]),cuCmulf(J1[1],C[2]));
   A[1]=cuCaddf(cuCmulf(J1[0],C[1]),cuCmulf(J1[1],C[3]));
   A[2]=cuCaddf(cuCmulf(J1[2],C[0]),cuCmulf(J1[3],C[2]));
   A[3]=cuCaddf(cuCmulf(J1[2],C[1]),cuCmulf(J1[3],C[3]));
   /* J2H: conjugate transpose of the Jones matrix of station 2
      (imaginary parts negated, off-diagonal entries swapped) */
   cuComplex J2H[4];
   J2H[0]=make_cuFloatComplex(g2[0],-g2[1]);
   J2H[2]=make_cuFloatComplex(g2[2],-g2[3]);
   J2H[1]=make_cuFloatComplex(g2[4],-g2[5]);
   J2H[3]=make_cuFloatComplex(g2[6],-g2[7]);
   /* B = A*J2H */
   cuComplex B[4];
   B[0]=cuCaddf(cuCmulf(A[0],J2H[0]),cuCmulf(A[1],J2H[2]));
   B[1]=cuCaddf(cuCmulf(A[0],J2H[1]),cuCmulf(A[1],J2H[3]));
   B[2]=cuCaddf(cuCmulf(A[2],J2H[0]),cuCmulf(A[3],J2H[2]));
   B[3]=cuCaddf(cuCmulf(A[2],J2H[1]),cuCmulf(A[3],J2H[3]));
   /* store the weighted model, interleaving real and imaginary parts */
   for (int k=0; k<4; k++) {
     x[8*bl+2*k]=wt[8*bl+2*k]*B[k].x;
     x[8*bl+2*k+1]=wt[8*bl+2*k+1]*B[k].y;
   }
 }
 /* Single-precision weighted Jacobian of vec(J_sta1 * C * J_sta2^H) with
    respect to the parameters.
    Launch layout: 2-D grid; x index = baseline n, y index = parameter m.
    Threads with n >= Nbase or m >= M do nothing.
    Nbase : number of baselines
    M     : number of parameters (columns of jac), 8 per station
    jac   : output, row-major with M columns; baseline n owns rows 8n..8n+7
    coh   : coherencies, 8 values per baseline
    p     : parameters, 8 values per station
    bb    : station pair of baseline n at bb[2n], bb[2n+1]; negative = flagged
    wt    : row weights, 8 values per baseline
    N     : not referenced inside the kernel
    Only entries where parameter m belongs to sta1 or sta2 of an unflagged
    baseline are written; every other entry of jac is left untouched.
    Offsets into jac are formed in size_t so they do not wrap around 32 bits
    when 8*Nbase*M is large. */
 __global__ void 
 kernel_jacf_wt_fl(int Nbase, int M, float *jac, float *coh, float *p, short *bb, float *wt, int N){
   /* global thread index : equal to the baseline */
   unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
   /* which parameter:0...M */
   unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
   if(n<Nbase && m<M) {
     int sta1=(int)bb[2*n];
     int sta2=(int)bb[2*n+1];
     /* station that owns parameter m (8 parameters per station) */
     int stc=m>>3;
     if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {
       /* first of the 8 rows (and of the 8 coh/wt values) of this baseline */
       const size_t row0=(size_t)8*(size_t)n;
       cuComplex C[4];
       for (int k=0; k<4; k++) {
         C[k].x=coh[row0+2*k];
         C[k].y=coh[row0+2*k+1];
       }
       /* which of the 8 values of that station: 0..7 */
       int stoff=m-stc*8;
       float pp1[8];
       float pp2[8];
       /* The station owning m is replaced by a unit vector selecting the
          differentiated value; the other station keeps its parameters. */
       if (stc==sta1) {
         for (int cn=0; cn<8; cn++) {
           pp1[cn]=0.0f;
           pp2[cn]=p[sta2*8+cn];
         }
         pp1[stoff]=1.0f;
       } else {
         /* here stc==sta2, guaranteed by the enclosing condition */
         for (int cn=0; cn<8; cn++) {
           pp2[cn]=0.0f;
           pp1[cn]=p[sta1*8+cn];
         }
         pp2[stoff]=1.0f;
       }
       cuComplex G1[4];
       for (int k=0; k<4; k++) {
         G1[k].x=pp1[2*k];
         G1[k].y=pp1[2*k+1];
       }
       cuComplex T1[4];
       /* T1=G1*C */
       T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
       T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
       T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
       T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
       cuComplex G2[4];
       /* conjugate transpose: negate imaginary parts, swap off-diagonals */
       G2[0].x=pp2[0];
       G2[0].y=-pp2[1];
       G2[2].x=pp2[2];
       G2[2].y=-pp2[3];
       G2[1].x=pp2[4];
       G2[1].y=-pp2[5];
       G2[3].x=pp2[6];
       G2[3].y=-pp2[7];
       cuComplex T2[4];
       /* T2=T1*G2 */
       T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
       T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
       T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
       T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
       /* update jacobian column m, with row weights (row major order) */
       const float dv[8]={T2[0].x,T2[0].y,T2[1].x,T2[1].y,T2[2].x,T2[2].y,T2[3].x,T2[3].y};
       for (int k=0; k<8; k++) {
         const size_t row=row0+(size_t)k;
         jac[(size_t)m+(size_t)M*row]=wt[row]*dv[k];
       }
     }
   }
 }
 /* Fill wt[0..N-1] with the constant alpha (single precision).
    Launch layout: 1-D grid, one thread per element; surplus threads exit. */
 __global__ void 
 kernel_setweights_fl(int N, float *wt, float alpha){
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=N) {
     return;
   }
   wt[idx]=alpha;
 }
 /* Element-wise product in place (single precision):
    x[i] <= x[i]*wt[i] for i in 0..N-1.
    Launch layout: 1-D grid, one thread per element; surplus threads exit. */
 __global__ void 
 kernel_hadamard_fl(int N, float *wt, float *x){
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=N) {
     return;
   }
   x[idx]*=wt[idx];
 }
 /* Single precision. For i in 0..N-1:
    wt[i] <= (nu+1)/(nu+x[i]^2) and q[i] <= wt[i]-logf(wt[i]).
    Launch layout: 1-D grid, one thread per element; surplus threads exit. */
 __global__ void 
 kernel_updateweights_fl(int N, float *wt, float *x, float *q, float nu){
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=N) {
     return;
   }
   wt[idx]=((nu+1.0f)/(nu+x[idx]*x[idx]));
   /* read the weight back from wt, exactly as stored */
   const float w=wt[idx];
   q[idx]=w-logf(w); /* so that its +ve */
 }
 /* Replace every weight by its square root (single precision):
    wt[i] <= sqrtf(wt[i]), i in 0..N-1.
    Launch layout: 1-D grid, one thread per element; surplus threads exit. */
 __global__ void 
 kernel_sqrtweights_fl(int N, float *wt){
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=N) {
     return;
   }
   wt[idx]=sqrtf(wt[idx]);
 }
 /* Device helper approximating the digamma function psi(x) in single precision.
    While x < 7 the term 1/x is subtracted and x is raised by one; the
    remainder is evaluated with a series in 1/(x-0.5). */
 __device__ float 
 digamma_fl(float x) {
   float acc = 0.0f;
   /* shift the argument upwards until x >= 7 */
   while (x < 7.0f) {
     acc -= 1.0f/x;
     ++x;
   }
   x -= 1.0f/2.0f;
   const float r = 1.0f/x;
   const float r2 = r*r;
   const float r4 = r2*r2;
   acc += logf(x)+(1.0f/24.0f)*r2-(7.0f/960.0f)*r4+(31.0f/8064.0f)*r4*r2-(127.0f/30720.0f)*r4*r4;
   return acc;
 }
 /* Single precision. For the candidate nu_i = nulow + i*deltanu (i in 0..Nd-1):
      q[i] = psi((nu_i+1)/2) - log((nu_i+1)/2) - psi(nu_i/2) + log(nu_i/2) - qsum + 1
    Launch layout: 1-D grid, one thread per candidate; surplus threads exit. */
 __global__ void 
 kernel_evaluatenu_fl(int Nd, float qsum, float *q, float deltanu,float nulow) {
   const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
   if (idx>=Nd) {
     return;
   }
   const float nu=(nulow+((float)idx)*deltanu);
   const float psi_hi=digamma_fl(nu*0.5f+0.5f);
   float acc=psi_hi-logf((nu+1.0f)*0.5f); /* psi((nu+1)/2)-log((nu+1)/2) */
   const float psi_lo=digamma_fl(nu*0.5f);
   acc+=-psi_lo+logf((nu)*0.5f); /* -psi((nu)/2)+log((nu)/2) */
   acc+=-qsum+1.0f; /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
   q[idx]=acc;
 }
 /* Single precision, AECM variant. For the candidate
    nu_i = nulow + i*deltanu (i in 0..Nd-1):
      q[i] = psi((nu0+2)/2) - log((nu0+2)/2) - psi(nu_i/2) + log(nu_i/2) - qsum + 1
    The nu0 term is identical for every candidate, so thread 0 of each block
    computes it once and publishes it to the block through shared memory.
    Launch layout: 1-D grid, one thread per candidate. Every thread of a block
    reaches the barrier, including those with index >= Nd. */
 __global__ void 
 kernel_evaluatenu_fl_eight(int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
   unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
   /* Must be __shared__: a per-thread local written only by thread 0 would
      leave the value uninitialized in all other threads of the block. */
   __shared__ float dgm0;
   if (threadIdx.x==0) {
     /* p=2 (was 8 earlier): psi((nu0+2)/2)-log((nu0+2)/2) */
     float t=digamma_fl(nu0*0.5f+1.0f);
     dgm0=t-logf((nu0+2.0f)*0.5f);
   }
   /* make thread 0's result visible to the whole block */
   __syncthreads();
   if (tid<Nd) {
     float thisnu=(nulow+((float)tid)*deltanu);
     q[tid]=dgm0; /* psi((nu0+2)/2)-log((nu0+2)/2) */
     float dgm=digamma_fl(thisnu*0.5f);
     q[tid]+=-dgm+logf((thisnu)*0.5f); /* -psi((nu)/2)+log((nu)/2) */
     q[tid]+=-qsum+1.0f; /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
   }
 }
 /* only use extern if calling code is C */
 extern "C"
 {
 /* Host launcher: set all N entries of the device array wt to alpha
    (kernel_setweights_fl), then wait for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_setweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float alpha) {
   kernel_setweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, alpha);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: element-wise product x <= x*wt over N device elements
    (kernel_hadamard_fl), then wait for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_hadamard_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float *x) {
   kernel_hadamard_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: recompute the N weights wt from the values in x and fill q
    (kernel_updateweights_fl, with nu = robust_nu), then wait for the device.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_updateweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float *x, float *q, float robust_nu) {
   kernel_updateweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x, q, robust_nu);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: replace each of the N device weights by its square root
    (kernel_sqrtweights_fl), then wait for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_sqrtweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt) {
   kernel_sqrtweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: evaluate the expression used to find the optimum nu on a
    grid of Nd candidates nulow + i*deltanu, writing the results to the device
    array q (kernel_evaluatenu_fl), then wait for the device to finish.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_evaluatenu_fl(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow) {
   kernel_evaluatenu_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu,nulow);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher: evaluate the expression used to find the optimum nu on a
    grid of Nd candidates nulow + i*deltanu, using AECM (p=8 before, but now
    p=2); results go to the device array q (kernel_evaluatenu_fl_eight).
    nu0: current value of robust_nu.
    Waits for the device to finish. With CUDA_DBG defined, a pending CUDA
    error is reported and the process exits. */
 void
 cudakernel_evaluatenu_fl_eight(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
   kernel_evaluatenu_fl_eight<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu,nulow, nu0);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher for wt \odot f(): the weighted model vector (single precision).
    p: params (Mx1), x: output (Nx1), coh: coherencies, bbh: baseline->station
    mapping, wt: weights, Nbase: number of baselines.
    x is zeroed first, then kernel_func_wt_fl fills the unflagged baselines.
    Nstations is forwarded as the kernel's last argument; M and Mclus are not
    used in this function.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_func_wt_fl(int ThreadsPerBlock, int BlocksPerGrid, float *p, float *x, int M, int N, float *coh, short *bbh, float *wt, int Nbase, int Mclus, int Nstations) {
   /* clear the output so skipped baselines read as zero */
   cudaMemset(x, 0, N*sizeof(float));
   kernel_func_wt_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase,  x, coh, p, bbh, wt, Nstations);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   const cudaError_t status = cudaGetLastError();
   if (status != cudaSuccess) {
     /* report the CUDA error message and stop */
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(status),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 /* Host launcher for wt \odot jacf(): the weighted Jacobian (single precision).
    p: params (Mx1), jac: jacobian (NxM, row major), coh: coherencies,
    bbh: baseline->station mapping, wt: row weights, Nbase: number of baselines.
    jac is zeroed first, then kernel_jacf_wt_fl fills the non-zero entries.
    The launch uses a fixed 16x8 thread block; ThreadsPerBlock_row,
    ThreadsPerBlock_col, Mclus and clus are not used in this function, and
    Nstations is forwarded as the kernel's last argument.
    The byte count for the memset is formed in size_t: N*M evaluated as int
    overflows once the Jacobian has 2^31 or more entries.
    With CUDA_DBG defined, a pending CUDA error is reported and the
    process exits. */
 void
 cudakernel_jacf_wt_fl(int ThreadsPerBlock_row, int  ThreadsPerBlock_col, float *p, float *jac, int M, int N, float *coh, short *bbh, float *wt, int Nbase, int Mclus, int Nstations, int clus) {
 #ifdef CUDA_DBG
   cudaError_t error;
 #endif
   /* NOTE: use small value for ThreadsPerBlock here, like 8 */
   dim3 threadsPerBlock(16, 8);
   /* grid: x covers the Nbase baselines, y covers the M parameters */
   dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x, 
                (M+threadsPerBlock.y-1)/threadsPerBlock.y);
   /* set memory of jac to zero; widen before multiplying to avoid int overflow */
   cudaMemset(jac, 0, (size_t)N*(size_t)M*sizeof(float));
   kernel_jacf_wt_fl<<< numBlocks, threadsPerBlock>>>(Nbase,  M, jac, coh, p, bbh, wt, Nstations);
   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
   error = cudaGetLastError();
   if(error != cudaSuccess)
   {
     // print the CUDA error message and exit
     fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
     exit(-1);
   }
 #endif
 }
 }
--- a/src/lib/Solvers/robust_fl.o
+++ b/src/lib/Solvers/robust_fl.o
--- a/src/lib/Solvers/robust_lbfgs_nocuda.c
+++ b/src/lib/Solvers/robust_lbfgs_nocuda.c
--- a/src/lib/Solvers/robust_lbfgs_nocuda.o
+++ b/src/lib/Solvers/robust_lbfgs_nocuda.o
--- a/src/lib/Solvers/robustlm.c
+++ b/src/lib/Solvers/robustlm.c
--- a/src/lib/Solvers/robustlm.o
+++ b/src/lib/Solvers/robustlm.o
--- a/src/lib/Solvers/rtr_solve.c
+++ b/src/lib/Solvers/rtr_solve.c
--- a/src/lib/Solvers/rtr_solve.o
+++ b/src/lib/Solvers/rtr_solve.o
--- a/src/lib/Solvers/rtr_solve_cuda.c
+++ b/src/lib/Solvers/rtr_solve_cuda.c
@ -1,894 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include <float.h>
 #include "Solvers.h"
 #include <cuda_runtime.h>
 //#define DEBUG
 /* helper functions for diagnostics */
 /* Report a failed CUDA runtime call.
    Only active when CUDA_DEBUG is defined: a non-success err prints the CUDA
    error string together with file and line to stderr and terminates the
    program. Without CUDA_DEBUG this function does nothing. */
 static void
 checkCudaError(cudaError_t err, char *file, int line)
 {
 #ifdef CUDA_DEBUG
    if (err != cudaSuccess) {
        fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
        exit(EXIT_FAILURE);
    }
 #endif
 }
 /* Report a failed cuBLAS call.
    Only active when CUDA_DEBUG is defined: any status other than
    CUBLAS_STATUS_SUCCESS prints file and line to stderr and terminates the
    program. Without CUDA_DEBUG this function does nothing. */
 static void
 checkCublasError(cublasStatus_t cbstatus, char *file, int line)
 {
 #ifdef CUDA_DEBUG
   if (cbstatus==CUBLAS_STATUS_SUCCESS) {
    return;
   }
   fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
   exit(EXIT_FAILURE);
 #endif
 }
 /* Retraction 
   rnew: new value */
 /* rnew = x + r
    x, r, rnew: device vectors of 4*N cuFloatComplex values
    cbhandle: cuBLAS handle used for the copy/axpy
    solver_handle: not used here
    Both cuBLAS calls are checked (the check is active only with CUDA_DEBUG) */
 void
 cudakernel_fns_R(int N, cuFloatComplex *x, cuFloatComplex *r, cuFloatComplex *rnew, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  cublasStatus_t cbstatus;
  cuFloatComplex alpha;
  /* rnew <= x */
  cbstatus=cublasCcopy(cbhandle,4*N,x,1,rnew,1);
  /* check the copy separately: its status would otherwise be overwritten below */
  checkCublasError(cbstatus,__FILE__,__LINE__);
  /* rnew <= rnew + 1*r */
  alpha.x=1.0f; alpha.y=0.0f;
  cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, r, 1, rnew, 1);
  checkCublasError(cbstatus,__FILE__,__LINE__);
 }
 /* inner product (metric)
    eta, gamma: device vectors of 4*N cuFloatComplex values (two columns of 2*N)
    x and solver_handle are not used here
    returns 2 x real( trace(eta'*gamma) )
    Both cuBLAS dot products are checked (the check is active only with CUDA_DEBUG) */
 float
 cudakernel_fns_g(int N,cuFloatComplex *x,cuFloatComplex *eta, cuFloatComplex *gamma,cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
 /* 2 x real( trace(eta'*gamma) )
  = 2 x real( eta(:,1)'*gamma(:,1) + eta(:,2)'*gamma(:,2) )
  no need to calculate off diagonal terms
  )*/
 cublasStatus_t cbstatus;
 cuFloatComplex r1,r2;
 /* first column: r1 = eta(:,1)'*gamma(:,1) */
 //complex double v1=my_cdot(2*N,eta,gamma);
 cbstatus=cublasCdotc(cbhandle,2*N,eta,1,gamma,1,&r1);
 /* check now: the status would otherwise be overwritten by the next call */
 checkCublasError(cbstatus,__FILE__,__LINE__);
 /* second column: r2 = eta(:,2)'*gamma(:,2) */
 //complex double v2=my_cdot(2*N,&eta[2*N],&gamma[2*N]);
 cbstatus=cublasCdotc(cbhandle,2*N,&eta[2*N],1,&gamma[2*N],1,&r2);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 return 2.0f*(r1.x+r2.x);
 }
 /* Projection 
   rnew: new value
   x, z, rnew: device vectors of 4*N cuFloatComplex values, used as
     2N x 2 column-major matrices X, Z and the result
   cbhandle: cuBLAS handle (dot products, triangular solve, gemm)
   solver_handle: cuSOLVER handle (QR factorization of the 4x4 system)
   A 4x4 system is built on the host from dot products, copied to the device,
   solved there, and the result is applied as rnew = Z - X*Om.
   NOTE(review): cbstatus is overwritten by consecutive cuBLAS calls and only
   checked at a few points; cusolver return values are not checked at all. */
 void
 cudakernel_fns_proj(int N, cuFloatComplex *x, cuFloatComplex *z, cuFloatComplex *rnew, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  /* projection  = Z-X Om, where
   Om X^H X+X^H X Om = X^H  Z - Z^H X
   is solved to find Om */
  cublasStatus_t cbstatus;
  /* find X^H X */
  cuFloatComplex xx00,xx01,xx10,xx11,*bd;
  //xx00=my_cdot(2*N,x,x);
  cbstatus=cublasCdotc(cbhandle,2*N,x,1,x,1,&xx00);
  //xx01=my_cdot(2*N,x,&x[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,x,1,&x[2*N],1,&xx01);
  /* X^H X is Hermitian, so the (1,0) element is the conjugate of (0,1) */
  xx10=cuConjf(xx01);
  //xx11=my_cdot(2*N,&x[2*N],&x[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,&x[2*N],1,&xx11);
  /* find X^H Z (and using this just calculate Z^H X directly) */
  cuFloatComplex xz00,xz01,xz10,xz11;
  //xz00=my_cdot(2*N,x,z);
  cbstatus=cublasCdotc(cbhandle,2*N,x,1,z,1,&xz00);
  //xz01=my_cdot(2*N,x,&z[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,x,1,&z[2*N],1,&xz01);
  //xz10=my_cdot(2*N,&x[2*N],z);
  cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,z,1,&xz10);
  //xz11=my_cdot(2*N,&x[2*N],&z[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,&z[2*N],1,&xz11);
  /* find X^H Z - Z^H X */
  cuFloatComplex rr00,rr01,rr10,rr11;
  //rr00=xz00-conj(xz00);
  rr00=cuCsubf(xz00,cuConjf(xz00));
  //rr01=xz01-conj(xz10);
  rr01=cuCsubf(xz01,cuConjf(xz10));
  //rr10=-conj(rr01);
  rr10.x=-rr01.x; rr10.y=rr01.y;
  //rr11=xz11-conj(xz11);
  rr11=cuCsubf(xz11,cuConjf(xz11));
  /* find I_2 kron (X^H X) + (X^H X)^T kron I_2 */
  /* A = [2*xx00  xx01       xx10         0
          xx10    xx11+xx00  0            xx10
          xx01    0          xx11+xx00    xx01
          0       xx01       xx10         2*xx11 ]
     stored below in column-major order, i.e. A[row+4*col]
  */
  cuFloatComplex A[16],*Ad;
  A[0]=cuCmulf(make_cuFloatComplex(2.0f,0.0f),xx00);
  A[5]=A[10]=cuCaddf(xx00,xx11);
  A[15]=cuCmulf(make_cuFloatComplex(2.0f,0.0f),xx11);
  A[1]=A[8]=A[11]=A[13]=xx10;
  A[2]=A[4]=A[7]=A[14]=xx01;
  A[3]=A[6]=A[9]=A[12]=make_cuFloatComplex(0.0f,0.0f);
  /* right hand side: the 2x2 matrix [rr00 rr01; rr10 rr11] in column-major order */
  cuFloatComplex b[4];
  b[0]=rr00;
  b[1]=rr10;
  b[2]=rr01;
  b[3]=rr11;
 #ifdef DEBUG
  printf("BEFOREA=[\n");
  printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[0].x,A[0].y,A[4].x,A[4].y,A[8].x,A[8].y,A[12].x,A[12].y);
  printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[1].x,A[1].y,A[5].x,A[5].y,A[9].x,A[9].y,A[13].x,A[13].y);
  printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[2].x,A[2].y,A[6].x,A[6].y,A[10].x,A[10].y,A[14].x,A[14].y);
  printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[3].x,A[3].y,A[7].x,A[7].y,A[11].x,A[11].y,A[15].x,A[15].y);
  printf("];\n");
  printf("BEFOREb=[\n");
  printf("%f+j*(%f)\n",b[0].x,b[0].y);
  printf("%f+j*(%f)\n",b[1].x,b[1].y);
  printf("%f+j*(%f)\n",b[2].x,b[2].y);
  printf("%f+j*(%f)\n",b[3].x,b[3].y);
  printf("];\n");
 #endif
  /* solve A u = b to find u on the device; all data and routines used here
     are single precision complex (cuFloatComplex) */
  cudaMalloc((void **)&Ad, 16*sizeof(cuFloatComplex));
  cudaMemcpy(Ad,A,16*sizeof(cuFloatComplex),cudaMemcpyHostToDevice);
  /* copy b to device */
  cudaMalloc((void **)&bd, 4*sizeof(cuFloatComplex));
  cudaMemcpy(bd,b,4*sizeof(cuFloatComplex),cudaMemcpyHostToDevice);
  //culaStatus status;
  //status=culaDeviceCgels('N',4,4,1,(culaDeviceFloatComplex *)Ad,4,(culaDeviceFloatComplex *)bd,4);
  //checkStatus(status,__FILE__,__LINE__);
  int work_size=0;
  int *devInfo;
  cudaError_t err;
  err=cudaMalloc((void**)&devInfo, sizeof(int));
  checkCudaError(err,__FILE__,__LINE__);
  cuFloatComplex *work,*taud;
  /* query workspace size for the QR factorization, then allocate it */
  cusolverDnCgeqrf_bufferSize(solver_handle, 4, 4, (cuFloatComplex *)Ad, 4, &work_size);
  err=cudaMalloc((void**)&work, work_size*sizeof(cuFloatComplex));
  err=cudaMalloc((void**)&taud, 4*sizeof(cuFloatComplex));
  checkCudaError(err,__FILE__,__LINE__);
  /* three step solve: A = Q R (geqrf), bd <= Q^H bd (unmqr with CUBLAS_OP_C),
     then back substitution with the upper triangular factor (trsm) */
  cusolverDnCgeqrf(solver_handle, 4, 4, Ad, 4, taud, work, work_size, devInfo);
  cusolverDnCunmqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_C, 4, 1, 4, Ad, 4, taud, bd, 4, work, work_size, devInfo);
  cuFloatComplex cone; cone.x=1.0f; cone.y=0.0f;
  cbstatus=cublasCtrsm(cbhandle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,4,1,&cone,Ad,4,bd,4);
  cudaFree(work); 
  cudaFree(taud); 
  cudaFree(devInfo); 
 #ifdef DEBUG
  cudaMemcpy(b,bd,4*sizeof(cuFloatComplex),cudaMemcpyDeviceToHost);
  printf("Afterb=[\n");
  printf("%f+j*(%f)\n",b[0].x,b[0].y);
  printf("%f+j*(%f)\n",b[1].x,b[1].y);
  printf("%f+j*(%f)\n",b[2].x,b[2].y);
  printf("%f+j*(%f)\n",b[3].x,b[3].y);
  printf("];\n");
 #endif
  /* form Z - X * Om, where Om is given by solution b 
    but no need to rearrange b because it is already in col major order */
  //my_ccopy(4*N,z,1,rnew,1);
  cbstatus=cublasCcopy(cbhandle,4*N,z,1,rnew,1);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  //my_zgemm('N','N',2*N,2,2,-1.0+0.0*_Complex_I,z,2*N,b,2,1.0+0.0*_Complex_I,rnew,2*N);
  /* gemm scalars: a1 = -1 multiplies X*Om, a2 = +1 keeps the copy of Z in rnew */
  cuFloatComplex a1,a2;
  a1.x=-1.0f; a1.y=0.0f;
  a2.x=1.0f; a2.y=0.0f;
 #ifdef DEBUG
 /* read back rnew (before the update) into pinned host memory for checking */
 cuFloatComplex *etalocal;
 cudaHostAlloc((void **)&etalocal, sizeof(cuFloatComplex)*4*N,cudaHostAllocDefault);
 cudaMemcpy(etalocal, rnew, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
 printf("Rnewbefore=[\n");
 int ci;
 for (ci=0; ci<2*N; ci++) {
  printf("%f+j*(%f) %f+j*(%f);\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
 }
 printf("]\n");
 #endif
  /* rnew <= -1 * X (2N x 2) * Om (2 x 2, held in bd) + 1 * rnew */
  cbstatus=cublasCgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_N,2*N,2,2,&a1,x,2*N,bd,2,&a2,rnew,2*N);
 #ifdef DEBUG
  checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaMemcpy(etalocal, rnew, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
 printf("Rnewafter=[\n");
 for (ci=0; ci<2*N; ci++) {
  printf("%f+j*(%f) %f+j*(%f);\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
 }
 printf("]\n");
 cudaFreeHost(etalocal);
 #endif
  checkCublasError(cbstatus,__FILE__,__LINE__);
  /* release the device copies of the 4x4 system and its solution */
  cudaFree(Ad); 
  cudaFree(bd); 
 }
 /* gradient, also projected to tangent space */
 /* need 8N*BlocksPerGrid+ 8N*2 float storage */
 /* x: current point, eta: output (projected gradient), both 4*N cuFloatComplex
    y, coh, bbh: data, coherencies and baseline map for M baselines
    iw: per station weights passed to cudakernel_fns_fscale
    negate: if nonzero the accumulated gradient is multiplied by -1 before projection
    When M is at least DEFAULT_TH_PER_BK*ThreadsPerBlock the baselines are
    processed in chunks of that size and the partial results are summed. */
 static void
 cudakernel_fns_fgrad(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, float *y, float *coh, short *bbh, float *iw, int negate, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  const size_t vecbytes=sizeof(cuFloatComplex)*4*N;
  cuFloatComplex *gsum,*gchunk;
  cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
  cuFloatComplex scal;
  /* gsum: accumulated gradient, gchunk: gradient of one chunk of baselines */
  cudaMalloc((void**)&gsum, vecbytes);
  cudaMalloc((void**)&gchunk, vecbytes);
  /* max number of baselines handled by one kernel call */
  const int chunk=DEFAULT_TH_PER_BK*ThreadsPerBlock;
  if (M>=chunk) {
    /* too many baselines for one call: start from zero and add up chunks */
    cudaMemset(gsum, 0, vecbytes);
    const int nchunks=(M+chunk-1)/chunk;
    int done=0;
    int k;
    scal.x=1.0f; scal.y=0.0f;
    for (k=0; k<nchunks; k++) {
      /* the last chunk may be shorter */
      const int len=(done+chunk<M ? chunk : M-done);
      const int blocks=(len+ThreadsPerBlock-1)/ThreadsPerBlock;
      cudakernel_fns_fgradflat(ThreadsPerBlock, blocks, N, len, x, gchunk, &y[done*8], &coh[done*8], &bbh[done*2]);
      /* gsum <= gsum + gchunk */
      cbstatus=cublasCaxpy(cbhandle,4*N, &scal, gchunk, 1, gsum, 1);
      done+=chunk;
    }
  } else {
    /* everything fits in a single kernel call */
    cudakernel_fns_fgradflat(ThreadsPerBlock, BlocksPerGrid, N, M, x, gsum, y, coh, bbh);
  }
  cudakernel_fns_fscale(N, gsum, iw);
  if (negate) {
    /* flip the sign to get the -ve gradient */
    scal.x=-1.0f; scal.y=0.0f;
    cbstatus=cublasCscal(cbhandle,4*N,&scal,gsum,1);
  }
  cudakernel_fns_proj(N, x, gsum, eta, cbhandle, solver_handle);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaFree(gsum);
  cudaFree(gchunk);
 }
 /* Hessian, also projected to tangent space */
 /* need 8N*BlocksPerGrid+ 8N*2 float storage */
 /* x: current point, eta: direction, fhess: output, all 4*N cuFloatComplex
    y, coh, bbh: data, coherencies and baseline map for M baselines
    iw: per station weights passed to cudakernel_fns_fscale
    When M is at least DEFAULT_TH_PER_BK*ThreadsPerBlock the baselines are
    processed in chunks of that size and the partial results are summed. */
 static void
 cudakernel_fns_fhess(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, short *bbh, float *iw, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  const size_t vecbytes=sizeof(cuFloatComplex)*4*N;
  cuFloatComplex *hsum,*hchunk;
  /* hsum: accumulated result, hchunk: result of one chunk of baselines */
  cudaMalloc((void**)&hsum, vecbytes);
  cudaMalloc((void**)&hchunk, vecbytes);
  cuFloatComplex one;
  cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
  /* max number of baselines handled by one kernel call */
  const int chunk=DEFAULT_TH_PER_BK*ThreadsPerBlock;
  if (M>=chunk) {
    /* too many baselines for one call: start from zero and add up chunks */
    cudaMemset(hsum, 0, vecbytes);
    const int nchunks=(M+chunk-1)/chunk;
    int done=0;
    int k;
    one.x=1.0f; one.y=0.0f;
    for (k=0; k<nchunks; k++) {
      /* the last chunk may be shorter */
      const int len=(done+chunk<M ? chunk : M-done);
      const int blocks=(len+ThreadsPerBlock-1)/ThreadsPerBlock;
      cudakernel_fns_fhessflat(ThreadsPerBlock, blocks, N, len, x, eta, hchunk, &y[done*8], &coh[done*8], &bbh[done*2]);
      /* hsum <= hsum + hchunk */
      cbstatus=cublasCaxpy(cbhandle,4*N, &one, hchunk, 1, hsum, 1);
      done+=chunk;
    }
  } else {
    /* everything fits in a single kernel call */
    cudakernel_fns_fhessflat(ThreadsPerBlock, BlocksPerGrid, N, M, x, eta, hsum, y, coh, bbh);
  }
  cudakernel_fns_fscale(N, hsum, iw);
  cudakernel_fns_proj(N, x, hsum, fhess, cbhandle, solver_handle);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaFree(hsum);
  cudaFree(hchunk);
 }
 /* Armijo step calculation,
  output teta: Armijo gradient 
  return value: 0 : cost reduced, 1: no cost reduction, so do not run again 
  mincost: minimum value of cost found, if possible
  x: current point (4*N cuFloatComplex on device), not modified here
  y, coh, bbh, iw: data, coherencies, baseline map and station weights (device)
  Up to 50 step sizes beta^k*alphabar (beta=0.2, alphabar=10) are tried; the
  search stops at the first one satisfying lhs<=rhs below.
 */
 /* need 8N*BlocksPerGrid+ 8N*2 float storage */
 static int
 armijostep(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *teta, float *y, float *coh, short *bbh, float *iw, float *mincost, cublasHandle_t cbhandle,  cusolverDnHandle_t solver_handle) {
 float alphabar=10.0f;
 float beta=0.2f;
 float sigma=0.5f;
 cublasStatus_t cbstatus;
 /* temp storage, allocated on the device here and freed before returning */ 
 cuFloatComplex *eta, *x_prop;
 cudaMalloc((void**)&eta, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&x_prop, sizeof(cuFloatComplex)*4*N);
 /* cost at the current point */
 //double fx=fns_f(x,y,gdata);
 float fx=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x,y,coh,bbh);
 /* eta <= projected gradient at x (negate flag is 0) */
 //fns_fgrad(x,eta,y,gdata,0);
 cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,x,eta,y,coh,bbh,iw,0,cbhandle, solver_handle);
 #ifdef DEBUG
 float eta_nrm;
 cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
 printf("||eta||=%f\n",eta_nrm);
 /* read back eta for checking */
 cuFloatComplex *etalocal;
 cudaHostAlloc((void **)&etalocal, sizeof(cuFloatComplex)*4*N,cudaHostAllocDefault);
 cudaMemcpy(etalocal, eta, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
 printf("Eta=[\n");
 int ci;
 for (ci=0; ci<2*N; ci++) {
  printf("%f %f %f %f\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
 }
 printf("]\n");
 cudaFreeHost(etalocal);
 #endif
 /* beta0: current step factor; minfx/minbeta: lowest cost seen so far and its factor */
 float beta0=beta;
 float minfx=fx; float minbeta=beta0;
 float lhs,rhs,metric;
 int m,nocostred=0;
 cuFloatComplex alpha;
 *mincost=fx;
 /* g(eta,eta) is computed once; g(eta,teta) for a scaled teta follows by scaling */
 float metric0=cudakernel_fns_g(N,x,eta,eta,cbhandle,solver_handle);
 for (m=0; m<50; m++) {
   /* abeta=(beta0)*alphabar*eta; */
   //my_ccopy(4*dp->N,eta,1,teta,1);
   cbstatus=cublasCcopy(cbhandle,4*N,eta,1,teta,1);
   //my_cscal(4*dp->N,beta0*alphabar+0.0*_Complex_I,teta);
   alpha.x=beta0*alphabar;alpha.y=0.0f;
   cbstatus=cublasCscal(cbhandle,4*N,&alpha,teta,1);
   /* Rx=R(x,teta); */
   //fns_R(dp->N,x,teta,x_prop);
   cudakernel_fns_R(N,x,teta,x_prop,cbhandle,solver_handle);
   /* cost at the trial point */
   //lhs=fns_f(x_prop,y,gdata);
   lhs=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,y,coh,bbh);
   /* remember the best trial so far, even if the Armijo test below fails */
   if (lhs<minfx) {
     minfx=lhs;
     *mincost=minfx;
     minbeta=beta0;
   }
   //rhs=fx+sigma*fns_g(dp->N,x,eta,teta);
   //metric=cudakernel_fns_g(N,x,eta,teta,cbhandle);
   metric=beta0*alphabar*metric0;
   rhs=fx+sigma*metric;
 #ifdef DEBUG
 printf("m=%d lhs=%e rhs=%e rat=%e norm=%e\n",m,lhs,rhs,lhs/rhs,metric);
 #endif
   /* accept this step size; a NaN cost never satisfies the test */
   if ((!isnan(lhs) && lhs<=rhs)) {
    minbeta=beta0;
    break;
   }
   /* shrink the step and try again */
   beta0=beta0*beta;
 }
 /* if no further cost improvement is seen */
 /* (lhs is the cost of the last trial point evaluated in the loop) */
 if (lhs>fx) {
     nocostred=1;
 }
 /* teta <= minbeta*alphabar*eta, the step that is returned */
 //my_ccopy(4*dp->N,eta,1,teta,1);
 cbstatus=cublasCcopy(cbhandle,4*N,eta,1,teta,1);
 alpha.x=minbeta*alphabar; alpha.y=0.0f;
 //my_cscal(4*dp->N,minbeta*alphabar+0.0*_Complex_I,teta);
 cbstatus=cublasCscal(cbhandle,4*N,&alpha,teta,1);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaFree(eta);
 cudaFree(x_prop);
 return nocostred;
 }
 /* truncated conjugate gradient method 
  x, grad, eta, r, z, delta, Hxd  : size 2N x 2  complex 
  so, vector size is 4N complex float (cuFloatComplex)
  output: eta (updated in place), fhess (updated in place with the same steps
   applied to Hxd, see "Heta = Heta + ..." below)
  return value: stop_tCG code   
   1: d_Hd<=0, 2: step would reach Delta^2, 3/4: kappa/theta residual test met
   (3 if kappa<norm_r0^theta, else 4), 5: max_inner iterations done
  y: vec(V) visibilities
  Delta: trust region radius; theta, kappa: stopping rule parameters
  max_inner, min_inner: iteration limits of the inner loop
 */
 /* need 8N*(BlocksPerGrid+2)+ 8N*6 float storage */
 static int
 tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *grad, cuFloatComplex *eta, cuFloatComplex *fhess, float Delta, float theta, float kappa, int max_inner, int min_inner, float *y, float *coh, short *bbh, float *iw, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) { 
  cuFloatComplex *r,*z,*delta,*Hxd, *rnew;
  float  e_Pe, r_r, norm_r, z_r, d_Pd, d_Hd, alpha, e_Pe_new,
     e_Pd, Deltasq, tau, zold_rold, beta, norm_r0;
  int cj, stop_tCG;
  /* work vectors on the device, freed before returning */
  cudaMalloc((void**)&r, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&z, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&delta, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&Hxd, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&rnew, sizeof(cuFloatComplex)*4*N);
  cublasStatus_t cbstatus;
  /* a0: scalar for the cuBLAS calls; only a0.x changes below, a0.y stays 0 */
  cuFloatComplex a0;
  /*
  initial values
  */
  /* r <= grad */
  cbstatus=cublasCcopy(cbhandle,4*N,grad,1,r,1);
  e_Pe=0.0f;
  r_r=cudakernel_fns_g(N,x,r,r,cbhandle,solver_handle);
  norm_r=sqrtf(r_r);
  norm_r0=norm_r;
  /* z <= r */
  cbstatus=cublasCcopy(cbhandle,4*N,r,1,z,1);
  z_r=cudakernel_fns_g(N,x,z,r,cbhandle,solver_handle);
  d_Pd=z_r;
  /*
   initial search direction
  */
  /* delta <= -z */
  cudaMemset(delta, 0, sizeof(cuFloatComplex)*4*N); 
  a0.x=-1.0f; a0.y=0.0f;
  cbstatus=cublasCaxpy(cbhandle,4*N, &a0, z, 1, delta, 1);
  e_Pd=cudakernel_fns_g(N,x,eta,delta,cbhandle,solver_handle);
  /* default code, kept if the loop runs to max_inner */
  stop_tCG=5;
  /* % begin inner/tCG loop
    for j = 1:max_inner,
  */
  for(cj=1; cj<=max_inner; cj++) {
    /* Hxd <= Hessian applied to delta (projected) */
    cudakernel_fns_fhess(ThreadsPerBlock,BlocksPerGrid,N,M,x,delta,Hxd,y,coh,bbh,iw, cbhandle, solver_handle);
    d_Hd=cudakernel_fns_g(N,x,delta,Hxd,cbhandle,solver_handle);
    alpha=z_r/d_Hd;
    e_Pe_new = e_Pe + 2.0f*alpha*e_Pd + alpha*alpha*d_Pd;
    Deltasq=Delta*Delta;
    /* d_Hd<=0 or the full step reaching Delta^2: take the step of length tau instead and stop */
    if (d_Hd <= 0.0f || e_Pe_new >= Deltasq) {
      tau = (-e_Pd + sqrtf(e_Pd*e_Pd + d_Pd*(Deltasq-e_Pe)))/d_Pd;
      a0.x=tau;
      /* eta = eta + tau*delta */
      cbstatus=cublasCaxpy(cbhandle,4*N, &a0, delta, 1, eta, 1);
      /* Heta = Heta + tau *Hdelta */
      cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, fhess, 1);
      stop_tCG=(d_Hd<=0.0f?1:2);
      break;
    }
    e_Pe=e_Pe_new;
    a0.x=alpha;
    /* eta = eta + alpha*delta */
    cbstatus=cublasCaxpy(cbhandle,4*N, &a0, delta, 1, eta, 1);
    /* Heta = Heta + alpha*Hdelta */
    cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, fhess, 1);
    /* r = r + alpha*Hdelta, then project r (via rnew) */
    cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, r, 1);
    cudakernel_fns_proj(N, x, r, rnew, cbhandle,solver_handle);
    cbstatus=cublasCcopy(cbhandle,4*N,rnew,1,r,1);
    r_r=cudakernel_fns_g(N,x,r,r,cbhandle,solver_handle);
    norm_r=sqrtf(r_r);
    /*
      check kappa/theta stopping criterion
    */
    if (cj >= min_inner) {
      float norm_r0pow=powf(norm_r0,theta);
      if (norm_r <= norm_r0*MIN(norm_r0pow,kappa)) {
       stop_tCG=(kappa<norm_r0pow?3:4);
       break;
      }
    }
    /* z <= r */
    cbstatus=cublasCcopy(cbhandle,4*N,r,1,z,1);
    zold_rold=z_r;
    z_r=cudakernel_fns_g(N,x,z,r,cbhandle,solver_handle);
    beta=z_r/zold_rold;
    /* delta <= beta*delta - z */
    a0.x=beta; 
    cbstatus=cublasCscal(cbhandle,4*N,&a0,delta,1);
    a0.x=-1.0f; 
    cbstatus=cublasCaxpy(cbhandle,4*N, &a0, z, 1, delta, 1);
    /* update the scalar recurrences to match the new delta */
    e_Pd = beta*(e_Pd + alpha*d_Pd);
    d_Pd = z_r + beta*beta*d_Pd;
  }
  /* NOTE(review): only the status of the last cuBLAS call is checked here */
  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaFree(r);
  cudaFree(z);
  cudaFree(delta);
  cudaFree(Hxd);
  cudaFree(rnew);
  return stop_tCG;
 }
 /* follow clmfit_fl.c */
 /* Solver driver: up to itmax_sd Armijo steps (RSD) followed by a trust region
    loop (RTR) of up to itmax_rtr outer iterations, all on the device.
    x0 is overwritten with the solution only if the final cost is lower than
    the initial cost; info[0]/info[1] receive initial/final cost.
    Always returns 0. */
 int
 rtr_solve_cuda_fl(
  float *x0,         /* initial values and updated solution at output (size 8*N float) */
  float *y,         /* data vector (size 8*M float) */
  int N,              /* no of stations */
  int M,              /* no of constraints */
  int itmax_sd,          /* maximum number of iterations RSD */
  int itmax_rtr,          /* maximum number of iterations RTR */
  float Delta_bar, float Delta0, /* Trust region radius and initial value (both overwritten after the RSD stage, see below) */
  double *info, /* initial and final residuals */
  cublasHandle_t cbhandle, /* device handle */
  cusolverDnHandle_t solver_handle, /* solver handle */
  int tileoff, /* tile offset when solving for many chunks */
  int ntiles, /* total tile (data) size being solved for */
  me_data_t *adata)
 {
  /* general note: all device variables end with a 'd' */
  cudaError_t err;
  cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
  /* ME data */
  me_data_t *dp=(me_data_t*)adata;
  int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
  /* coherency on device */
  float *cohd;
  /* baseline-station map on device/host */
  short *bbd;
  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
  /* reshape x to make J: 2Nx2 complex float 
  */
  complex float *x;
  if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
 #ifndef USE_MIC
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
   exit(1);
  }
  /* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
   to
  J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
 */
  /* Jd views the complex buffer x as plain floats (re,im interleaved) */
  float *Jd=(float*)x;
  /* re J(0,0) */
  my_fcopy(N, &x0[0], 8, &Jd[0], 4);
  /* im J(0,0) */
  my_fcopy(N, &x0[1], 8, &Jd[1], 4);
  /* re J(1,0) */
  my_fcopy(N, &x0[4], 8, &Jd[2], 4);
  /* im J(1,0) */
  my_fcopy(N, &x0[5], 8, &Jd[3], 4);
  /* re J(0,1) */
  my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
  /* im J(0,1) */
  my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
  /* re J(1,1) */
  my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
  /* im J(1,1) */
  my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
  int ci;
 /***************************************************/
 cuFloatComplex *xd,*fgradxd,*etad,*Hetad,*x_propd;
 float *yd;
 /* for counting how many baselines contribute to each station
   grad/hess calculation */
 float *iwd,*iw;
 if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
 #ifndef USE_MIC
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
   exit(1);
 }
 /* device buffers; NOTE(review): the return values of these allocations are not checked */
 cudaMalloc((void**)&fgradxd, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&etad, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&Hetad, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&x_propd, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&xd, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&yd, sizeof(float)*8*M);
 cudaMalloc((void**)&cohd, sizeof(float)*8*Nbase);
 cudaMalloc((void**)&bbd, sizeof(short)*2*Nbase);
 cudaMalloc((void**)&iwd, sizeof(float)*N);
 /* need 8N*(BlocksPerGrid+8) for tcg_solve+grad/hess storage,
   so total storage needed is 
   8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N
 */
 /* yd <=y : V */
 err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 /* need to give right offset for coherencies */
 /* offset: cluster offset+time offset */
 /* C */
 err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 /* correct offset for baselines */
 err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(short), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 /* xd <=x : solution */
 err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 float fx,fx0,norm_grad,Delta,fx_prop,rhonum,rhoden,rho;
 /* count how many baselines contribute to each station, store (inverse) in iwd */
 count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
 err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 /* host copy of the weights is no longer needed */
 free(iw);
 /* initial cost, kept in fx0 for the final comparison */
 fx=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd);
 fx0=fx;
 #ifdef DEBUG
 printf("Initial Cost=%g\n",fx0);
 #endif
 /***************************************************/
 int rsdstat=0;
 /* RSD solution */
 for (ci=0; ci<itmax_sd; ci++) {
  /* Armijo step */
  /* teta=armijostep(V,C,N,x); */
  //armijostep(N,x,eta,y,&gdata);
  /* fx receives the minimum cost found by the line search */
  rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,&fx,cbhandle,solver_handle);
  /* x=R(x,teta); */
  cudakernel_fns_R(N,xd,etad,x_propd,cbhandle,solver_handle);
  //my_ccopy(4*N,x_propd,1,xd,1);
  if (!rsdstat) {
   /* cost reduced, update solution */
   cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
  } else {
   /* no cost reduction, break loop */
   break; 
  }
 }
 /* NOTE: the Delta_bar and Delta0 values passed by the caller are replaced
    here by values derived from the current cost */
 Delta_bar=MIN(fx,0.01f);
 Delta0=Delta_bar*0.125f;
 //printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
 #ifdef DEBUG
 printf("NEW RSD cost=%g\n",fx);
 #endif
 /***************************************************/
   int min_inner,max_inner,min_outer,max_outer;
   float epsilon,kappa,theta,rho_prime;
   min_inner=1; max_inner=itmax_rtr;//8*N;
   min_outer=3;//itmax_rtr; //3; 
   max_outer=itmax_rtr;
   epsilon=(float)CLM_EPSILON;
   kappa=0.1f;
   theta=1.0f;
   /* default values 0.25, 0.75, 0.25, 2.0 */
   /* eta1/eta2: rho thresholds for shrinking/growing Delta, alpha1/alpha2: the shrink/grow factors */
   float eta1=0.0001f; float eta2=0.99f; float alpha1=0.25f; float alpha2=3.5f;
   rho_prime=eta1; /* should be <= 0.25, tune for parallel solve  */
   float rho_regularization; /* use large damping */
   rho_regularization=fx*1e-6f;
   /* damping: too small => locally converge, globally diverge
           |\
        |\ | \___
    -|\ | \|
       \
    right damping:  locally and globally converge
    -|\      
       \|\  
          \|\
             \____ 
    */
   float rho_reg;
   int model_decreased=0;
  /* RTR solution */
  int k=0;
  /* with itmax_rtr<=0 the trust region loop is skipped entirely */
  int stop_outer=(itmax_rtr>0?0:1);
  int stop_inner=0;
  if (!stop_outer) {
   /* fgradxd <= gradient with negate=1, and its norm */
   cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,1,cbhandle,solver_handle);
   norm_grad=sqrtf(cudakernel_fns_g(N,xd,fgradxd,fgradxd,cbhandle,solver_handle));
  }
  Delta=Delta0;
  /* initial residual */
  info[0]=fx0;
  /*
   % ** Start of TR loop **
  */
   while(!stop_outer) {
    /*  
     % update counter
    */
     k++;
    /* eta = 0*fgradx; */
    cudaMemset(etad, 0, sizeof(cuFloatComplex)*4*N);
    /* solve TR subproblem, also returns Hessian */
    /* NOTE(review): tcg_solve_cuda only adds to Hetad and it is not zeroed
       here like etad is — confirm this is intended */
    stop_inner=tcg_solve_cuda(ThreadsPerBlock,BlocksPerGrid, N, M, xd, fgradxd, etad, Hetad, Delta, theta, kappa, max_inner, min_inner,yd,cohd,bbd,iwd,cbhandle, solver_handle);
    /*
        Heta = fns.fhess(x,eta);
    */
    /*
      compute the retraction of the proposal
    */
   cudakernel_fns_R(N,xd,etad,x_propd,cbhandle,solver_handle);
    /*
      compute cost of the proposal
    */
    fx_prop=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x_propd,yd,cohd,bbd);
    /*
      check the performance of the quadratic model
    */
    /* rhonum: actual cost reduction, rhoden: reduction predicted by the model */
    rhonum=fx-fx_prop;
    rhoden=-cudakernel_fns_g(N,xd,fgradxd,etad,cbhandle,solver_handle)-0.5f*cudakernel_fns_g(N,xd,Hetad,etad,cbhandle,solver_handle);
    /* regularization of rho ratio */
    /* 
    rho_reg = max(1, abs(fx)) * eps * options.rho_regularization;
    rhonum = rhonum + rho_reg;
    rhoden = rhoden + rho_reg;
    */
    rho_reg=MAX(1.0f,fx)*rho_regularization; /* no epsilon */
    rhonum+=rho_reg;
    rhoden+=rho_reg;
     /*
        rho =   rhonum  / rhoden;
     */
     rho=rhonum/rhoden;
    /* model_decreased = (rhoden >= 0); */
   /* OLD CODE if (fabsf(rhonum/fx) <sqrtf_epsilon) {
     rho=1.0f;
    } */
    model_decreased=(rhoden>=0.0f?1:0);
 #ifdef DEBUG
    printf("stop_inner=%d rho_reg=%g rho =%g/%g= %g rho'= %g\n",stop_inner,rho_reg,rhonum,rhoden,rho,rho_prime);
 #endif
    /*
      choose new TR radius based on performance
    */
    /* shrink on a poor model; grow (capped at Delta_bar) only when the inner
       solver stopped with code 1 or 2 */
    if ( !model_decreased || rho<eta1 ) {
      Delta=alpha1*Delta;
    } else if (rho>eta2 && (stop_inner==2 || stop_inner==1)) {
      Delta=MIN(alpha2*Delta,Delta_bar);
    }
    /*
      choose new iterate based on performance
    */
    if (model_decreased && rho>rho_prime) {
     /* accept the proposal and refresh gradient and its norm at the new point */
     cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
     fx=fx_prop;
     cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,1,cbhandle,solver_handle);
     norm_grad=sqrtf(cudakernel_fns_g(N,xd,fgradxd,fgradxd,cbhandle,solver_handle));
    }
    /*
     Testing for Stop Criteria
    */
    if (norm_grad<epsilon && k>min_outer) {
      stop_outer=1;
    }
    /*
     stop after max_outer iterations
     */
    if (k>=max_outer) {
      stop_outer=1;
    }
 #ifdef DEBUG
 printf("Iter %d cost=%g\n",k,fx);
 #endif
   }
   /* final residual */
   info[1]=fx;
 #ifdef DEBUG
 printf("NEW RTR cost=%g\n",fx);
 #endif
 /***************************************************/
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaDeviceSynchronize();
  /* only return the new solution if the cost has been reduced; otherwise x0 is left untouched */
  if(fx0>fx) {
  //printf("Cost final %g  initial %g\n",fx,fx0);
  /* copy back current solution */
  err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
  checkCudaError(err,__FILE__,__LINE__);
  /* copy back solution to x0 : format checked*/
  /* re J(0,0) */
  my_fcopy(N, &Jd[0], 4, &x0[0], 8);
  /* im J(0,0) */
  my_fcopy(N, &Jd[1], 4, &x0[1], 8);
  /* re J(1,0) */
  my_fcopy(N, &Jd[2], 4, &x0[4], 8);
  /* im J(1,0) */
  my_fcopy(N, &Jd[3], 4, &x0[5], 8);
  /* re J(0,1) */
  my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
  /* im J(0,1) */
  my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
  /* re J(1,1) */
  my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
  /* im J(1,1) */
  my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
  }
  /* release host and device buffers */
  free(x);
 cudaFree(fgradxd);
 cudaFree(etad);
 cudaFree(Hetad);
 cudaFree(x_propd);
 cudaFree(xd);
 cudaFree(yd);
 cudaFree(cohd);
 cudaFree(bbd);
 cudaFree(iwd);
  return 0;
 }
--- a/src/lib/Solvers/rtr_solve_cuda.o
+++ b/src/lib/Solvers/rtr_solve_cuda.o
--- a/src/lib/Solvers/rtr_solve_robust.c
+++ b/src/lib/Solvers/rtr_solve_robust.c
--- a/src/lib/Solvers/rtr_solve_robust.o
+++ b/src/lib/Solvers/rtr_solve_robust.o
--- a/src/lib/Solvers/rtr_solve_robust_admm.c
+++ b/src/lib/Solvers/rtr_solve_robust_admm.c
--- a/src/lib/Solvers/rtr_solve_robust_admm.o
+++ b/src/lib/Solvers/rtr_solve_robust_admm.o
--- a/src/lib/Solvers/rtr_solve_robust_cuda.c
+++ b/src/lib/Solvers/rtr_solve_robust_cuda.c
--- a/src/lib/Solvers/rtr_solve_robust_cuda.o
+++ b/src/lib/Solvers/rtr_solve_robust_cuda.o
--- a/src/lib/Solvers/rtr_solve_robust_cuda_admm.c
+++ b/src/lib/Solvers/rtr_solve_robust_cuda_admm.c
--- a/src/lib/Solvers/rtr_solve_robust_cuda_admm.o
+++ b/src/lib/Solvers/rtr_solve_robust_cuda_admm.o
--- a/src/lib/Solvers/updatenu.c
+++ b/src/lib/Solvers/updatenu.c
@ -1,443 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "Solvers.h"
 #include <math.h>
 /* Digamma function
   if x>7 use digamma(x) = digamma(x+1) - 1/x
   for accuracy
   using maple expansion
   series(Psi(x+1/2), x=infinity, 21);
   ln(x)+1/24/x^2-7/960/x^4+31/8064/x^6-127/30720/x^8+511/67584/x^10-1414477/67092480/x^12+8191/98304/x^14-118518239/267386880/x^16+5749691557/1882718208/x^18-91546277357/3460300800/x^20+O(1/x^21)
   based on code by Mark Johnson, 2nd September 2007
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static double
 digamma(double x) {
  /* FIXME catch -ve value as input */
  double result = 0.0, xx, xx2, xx4;
  for ( ; x < 7.0; ++x) { /* reduce x till x<7 */
    result -= 1.0/x;
  }
  x -= 0.5;
  xx = 1.0/x;
  xx2 = xx*xx;
  xx4 = xx2*xx2;
  result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4;
  return result;
 }
 /* update w<= (nu+1)/(nu+delta^2)
   then q <= w-log(w), so that it is +ve
 */ 
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void *
 w_nu_update_threadfn(void *data) {
 /* Thread worker: for every index in [starti,endi] (inclusive) store
    wtd[i] = (nu0+1)/(nu0+ed[i]^2) and q[i] = wtd[i]-log(wtd[i]).
    data: pointer to a thread_data_vecnu_t; always returns NULL. */
 thread_data_vecnu_t *td=(thread_data_vecnu_t*)data;
 int idx=td->starti;
 while (idx<=td->endi) {
   /* (original kept a disabled step here: ed[i]*=wtd[i]) */
   const double resid=td->ed[idx];
   const double wnew=(td->nu0+1.0)/(td->nu0+resid*resid);
   td->wtd[idx]=wnew;
   td->q[idx]=wnew-log(wnew);
   idx++;
 }
 return NULL;
 }
 /* update w<= sqrt(w) */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void *
 w_sqrt_threadfn(void *data) {
 /* Thread worker: replace wtd[i] by sqrt(wtd[i]) for i in [starti,endi]
    (inclusive). data: pointer to a thread_data_vecnu_t; returns NULL. */
 thread_data_vecnu_t *td=(thread_data_vecnu_t*)data;
 int idx=td->starti;
 while (idx<=td->endi) {
   const double cur=td->wtd[idx];
   td->wtd[idx]=sqrt(cur);
   idx++;
 }
 return NULL;
 }
 /* update nu  */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void *
 q_update_threadfn(void *data) {
 /* Thread worker: for each sample index i in [starti,endi] (inclusive)
    evaluate, at nu = nulow + i*nu0 (nu0 holds the grid spacing here),
      q[i] = psi((nu+1)/2)-log((nu+1)/2) - psi(nu/2)+log(nu/2) - sumq + 1
    data: pointer to a thread_data_vecnu_t; returns NULL. */
 thread_data_vecnu_t *td=(thread_data_vecnu_t*)data;
 int idx;
 for (idx=td->starti; idx<=td->endi; idx++) {
   const double nu=(td->nulow+(double)idx*td->nu0); /* deltanu stored in nu0 */
   /* psi((nu+1)/2)-log((nu+1)/2) */
   double qv=digamma(nu*0.5+0.5)-log((nu+1.0)*0.5);
   /* -psi((nu)/2)+log((nu)/2) */
   qv+=-digamma(nu*0.5)+log((nu)*0.5);
   /* q is w-log(w), so -ve: sum(ln(w_i))/N-sum(w_i)/N+1 */
   qv+=-td->sumq+1.0;
   td->q[idx]=qv;
 }
 return NULL;
 }
 /* update nu  */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void *
 q_update_threadfn_aecm(void *data) {
 /* Thread worker (AECM variant): for each sample index i in [starti,endi]
    (inclusive) evaluate, at nu = nulow + i*nu0 (nu0 holds the grid spacing),
      q[i] = -psi(nu/2)+log(nu/2) - sumq + 1
    data: pointer to a thread_data_vecnu_t; returns NULL. */
 thread_data_vecnu_t *td=(thread_data_vecnu_t*)data;
 int idx;
 for (idx=td->starti; idx<=td->endi; idx++) {
   const double nu=(td->nulow+(double)idx*td->nu0); /* deltanu stored in nu0 */
   /* -psi((nu)/2)+log((nu)/2) */
   double qv=-digamma(nu*0.5)+log((nu)*0.5);
   /* q is w-log(w), so -ve: sum(ln(w_i))/N-sum(w_i)/N+1 */
   qv+=-td->sumq+1.0;
   td->q[idx]=qv;
 }
 return NULL;
 }
 /* update nu (degrees of freedom)
   also update w
   nu0: current value of nu
   w: Nx1 weight vector
   ed: Nx1 residual error
   psi() : digamma function
   find solution to
   psi((nu+1)/2)-ln((nu+1)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0
   use ln(gamma()) => lgamma_r
 */
 double
 update_w_and_nu(double nu0, double *w, double *ed, int N, int Nt, double nulow, double nuhigh) {
  /* Two stages, each spread over at most Nt threads:
     1) w[i] <= (nu0+1)/(nu0+ed[i]^2) and q[i] <= w[i]-log(w[i]); the mean
        of q is kept in sumq, then w[i] <= sqrt(w[i]).
     2) the nu equation is evaluated on Nd equally spaced samples
        nulow+k*deltanu, and the sample selected by my_idamin() is returned.
     nu0: current nu, w: Nx1 weights (overwritten), ed: Nx1 residuals,
     Nt: number of threads, [nulow,nuhigh]: search range for nu.
     Returns the new nu; exits the process if memory cannot be allocated. */
  int Nd=30; /* no of samples to estimate nu */
  int nth,nth1,ci;
  int Nthb0,Nthb;
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_vecnu_t *threaddata;
  double deltanu,*q,thisnu,sumq;
  size_t qlen;
  /* at least one worker is needed, also avoids dividing by zero below */
  if (Nt<1) {
   Nt=1;
  }
  /* q is reused in stage 2 to hold the Nd grid values, so it must be able
     to hold max(N,Nd) entries (it used to overflow when N<Nd) */
  qlen=(size_t)((N>Nd)?N:Nd);
  if ((q=(double*)calloc(qlen,sizeof(double)))==0) {
 #ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
      exit(1);
  }
  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
 #ifndef USE_MIC
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
   exit(1);
  }
  if ((threaddata=(thread_data_vecnu_t*)malloc((size_t)Nt*sizeof(thread_data_vecnu_t)))==0) {
 #ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
    exit(1);
  }
  /* calculate min values a thread can handle */
  Nthb0=(N+Nt-1)/Nt;
  /* iterate over threads, allocating indices per thread */
  ci=0;
  for (nth=0;  nth<Nt && ci<N; nth++) {
    if (ci+Nthb0<N) {
     Nthb=Nthb0;
    } else {
     Nthb=N-ci;
    }
    threaddata[nth].starti=ci;
    threaddata[nth].endi=ci+Nthb-1;
    threaddata[nth].ed=ed;
    threaddata[nth].wtd=w;
    threaddata[nth].q=q;
    threaddata[nth].nu0=nu0;
    threaddata[nth].nulow=nulow;
    threaddata[nth].nuhigh=nuhigh;
    pthread_create(&th_array[nth],&attr,w_nu_update_threadfn,(void*)(&threaddata[nth]));
    /* next baseline set */
    ci=ci+Nthb;
  }
  /* now wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  sumq=my_dasum(N,q)/(double)N; /* sum(|w_i-log(w_i)|/N), assume all elements are +ve */
  /* same index ranges as above, now taking the square root of the weights */
  for(nth1=0; nth1<nth; nth1++) {
    pthread_create(&th_array[nth1],&attr,w_sqrt_threadfn,(void*)(&threaddata[nth1]));
  }
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  /* search range 2 to 30 because if nu~=30, its Gaussian */
  deltanu=(double)(nuhigh-nulow)/(double)Nd;
  Nthb0=(Nd+Nt-1)/Nt;
  /* check for too low number of values per thread, halve the threads
     (but never down to zero threads) */
  if (Nthb0<=2 && Nt>1) {
   Nt=Nt/2;
   Nthb0=(Nd+Nt-1)/Nt;
  }
  ci=0;
  for (nth=0;  nth<Nt && ci<Nd; nth++) {
    if (ci+Nthb0<Nd) {
     Nthb=Nthb0;
    } else {
     Nthb=Nd-ci;
    }
    threaddata[nth].starti=ci;
    threaddata[nth].endi=ci+Nthb-1;
    threaddata[nth].q=q;
    threaddata[nth].nu0=deltanu;
    /* nulow is read by the worker; set it here as well because this stage
       can use slots that stage 1 never initialised (small N) */
    threaddata[nth].nulow=nulow;
    threaddata[nth].nuhigh=nuhigh;
    threaddata[nth].sumq=sumq;
    pthread_create(&th_array[nth],&attr,q_update_threadfn,(void*)(&threaddata[nth]));
    /* next baseline set */
    ci=ci+Nthb;
  }
  /* now wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
  /* NOTE(review): other code in this project indexes with my_idamin()-1,
     which suggests a 1-based result; if so this picks the sample one step
     above the minimum - confirm before changing */
  ci=my_idamin(Nd,q,1);
  thisnu=(nulow+(double)ci*deltanu);
  free(q);
  return thisnu;
 }
 /* update nu (degrees of freedom)
   nu_old: old nu
   logsumw = 1/N sum(log(w_i)-w_i)
   use Nd values in [nulow,nuhigh] to find nu
   psi() : digamma function
   find solution to
   psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0
   use ln(gamma()) => lgamma_r
   p: 1 or 8
 */
 double
 update_nu(double logsumw, int Nd, int Nt, double nulow, double nuhigh, int p, double nu_old) {
  /* Evaluate the nu equation on Nd equally spaced samples nulow+k*deltanu
     (spread over at most Nt threads) and return the sample selected by
     my_idamin().
     logsumw: 1/N sum(log(w_i)-w_i), p: 1 or 8, nu_old: previous nu.
     Exits the process if memory cannot be allocated. */
  int ci,nth,nth1,Nthb,Nthb0;
  double deltanu,thisnu,*q;
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_vecnu_t *threaddata;
  /* at least one worker is needed, also avoids dividing by zero below */
  if (Nt<1) {
   Nt=1;
  }
  if ((q=(double*)calloc((size_t)Nd,sizeof(double)))==0) {
 #ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
      exit(1);
  }
  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
 #ifndef USE_MIC
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
   exit(1);
  }
  if ((threaddata=(thread_data_vecnu_t*)malloc((size_t)Nt*sizeof(thread_data_vecnu_t)))==0) {
 #ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
    exit(1);
  }
  /* calculate psi((nu_old+p)/2)-ln((nu_old+p)/2) */
  double dgm=digamma((nu_old+(double)p)*0.5);
  dgm=dgm-log((nu_old+(double)p)*0.5); /* psi((nu+p)/2)-log((nu+p)/2) */
  deltanu=(double)(nuhigh-nulow)/(double)Nd;
  Nthb0=(Nd+Nt-1)/Nt;
  /* check for too low number of values per thread, halve the threads
     (but never down to zero threads: Nt==1 used to give Nt=0 and a
     division by zero when Nd<=2) */
  if (Nthb0<=2 && Nt>1) {
   Nt=Nt/2;
   Nthb0=(Nd+Nt-1)/Nt;
  }
  ci=0;
  for (nth=0;  nth<Nt && ci<Nd; nth++) {
    if (ci+Nthb0<Nd) {
     Nthb=Nthb0;
    } else {
     Nthb=Nd-ci;
    }
    threaddata[nth].starti=ci;
    threaddata[nth].endi=ci+Nthb-1;
    threaddata[nth].q=q;
    threaddata[nth].nu0=deltanu; /* worker reads the grid spacing from nu0 */
    threaddata[nth].nulow=nulow;
    threaddata[nth].nuhigh=nuhigh;
    /* fold the nu_old dependent term into sumq so the worker subtracts both */
    threaddata[nth].sumq=-logsumw-dgm;
    pthread_create(&th_array[nth],&attr,q_update_threadfn_aecm,(void*)(&threaddata[nth]));
    /* next baseline set */
    ci=ci+Nthb;
  }
  /* now wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
  /* NOTE(review): other code in this project indexes with my_idamin()-1,
     which suggests a 1-based result - confirm whether ci needs adjusting */
  ci=my_idamin(Nd,q,1);
  thisnu=(nulow+((double)ci)*deltanu);
  free(q);
  return thisnu;
 }
 /* ud = sqrt(u^2+v^2) */
 static double
 ncp_weight(double ud) {
 /*    fo(x) = 1/(1+alpha*exp(-x/A))
      A ~=30
 */
 if (ud>400.0) return 1.0; /* no effect on long baselines */
 //return 1.0/(1.0+0.4*exp(-0.05*ud)); 
 return 1.0/(1.0+1.8*exp(-0.05*ud)); 
 }
 static void *
 threadfn_setblweight(void *data) {
 /* Thread worker: for the Nb baselines starting at offset boff, compute the
    taper from sqrt((u*freq0)^2+(v*freq0)^2) and scale the 8 values of that
    baseline in wt by it. data: pointer to a thread_data_baselinewt_t;
    returns NULL. */
 thread_data_baselinewt_t *td=(thread_data_baselinewt_t*)data;
 int nb,k;
 for (nb=0; nb<td->Nb; nb++) {
  /* get sqrt(u^2+v^2) */
  double uu=td->u[nb+td->boff]*td->freq0;
  double vv=td->v[nb+td->boff]*td->freq0;
  double taper=ncp_weight(sqrt(uu*uu+vv*vv));
  /* the 8 consecutive values belonging to this baseline */
  double *blk=&td->wt[8*(nb+td->boff)];
  for (k=0; k<8; k++) {
   blk[k]*=taper;
  }
 }
 return NULL;
 }
 /* 
  taper data by weighting based on uv distance (for short baselines)
  for example: use weights as the inverse density function
  1/( 1+f(u,v) ) 
 as u,v->inf, f(u,v) -> 0 so long baselines are not affected 
 x: Nbase*8 x 1 (input,output) data
 u,v : Nbase x 1
 note: u = u/c, v=v/c here, so need freq to convert to wavelengths */
 void
 whiten_data(int Nbase, double *x, double *u, double *v, double freq0, int Nt) {
 pthread_attr_t attr;
 pthread_t *th_array;
 thread_data_baselinewt_t *threaddata;
 int ci,nth1,nth;
 int Nthb0,Nthb;
 Nthb0=(Nbase+Nt-1)/Nt;
 pthread_attr_init(&attr);
 pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
 if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
 #ifndef USE_MIC
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
   exit(1);
 }
 if ((threaddata=(thread_data_baselinewt_t*)malloc((size_t)Nt*sizeof(thread_data_baselinewt_t)))==0) {
 #ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
    exit(1);
 }
  /* iterate over threads, allocating baselines per thread */
  ci=0;
  for (nth=0;  nth<Nt && ci<Nbase; nth++) {
    if (ci+Nthb0<Nbase) {
     Nthb=Nthb0;
    } else {
     Nthb=Nbase-ci;
    }
    threaddata[nth].Nb=Nthb;
    threaddata[nth].boff=ci;
    threaddata[nth].wt=x;
    threaddata[nth].u=u;
    threaddata[nth].v=v;
    threaddata[nth].freq0=freq0;
    pthread_create(&th_array[nth],&attr,threadfn_setblweight,(void*)(&threaddata[nth]));
    /* next baseline set */
    ci=ci+Nthb;
  }
  /* now wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
 pthread_attr_destroy(&attr);
 free(th_array);
 free(threaddata);
 }
--- a/src/lib/Solvers/updatenu.o
+++ b/src/lib/Solvers/updatenu.o
--- a/src/lib/admm_solve.c
+++ b/src/lib/admm_solve.c
--- a/src/lib/barrier.c
+++ b/src/lib/barrier.c
@ -1,121 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include "sagecal.h"
 /* implementation of a barrier to sync threads.
  The barrier has two doors (enter and exit). Only one door 
  can be open at a time. Initially the enter door is open.
  All threads that enter the barrier are sleeping (wait).
  The last thread to enter the barrier will 
   1)close the enter door
   2)wakeup all sleeping threads.
   3)open the exit door.
  So the woken up threads will leave the barrier one by 
  one, as they are awoken. The last thread to leave the barrier
  will
   1)open the enter door 
   2)close the exit door,
  So finally the barrier reaches its initial state
 */
 /* initialize barrier */
 /* N - no. of accommodated threads */
 void
 init_th_barrier(th_barrier *barrier, int N)
 {
 /* Prepare a barrier for N threads: zero waiting threads, and default
    attributes for both mutexes and both condition variables. */
 /* synchronisation objects first */
 pthread_mutex_init(&barrier->enter_mutex,NULL);
 pthread_mutex_init(&barrier->exit_mutex,NULL);
 pthread_cond_init(&barrier->lastthread_cond,NULL);
 pthread_cond_init(&barrier->exit_cond,NULL);
 /* then the counters */
 barrier->nthreads=N;
 barrier->tcount=0; /* initially empty */
 }
 /* destroy barrier */
 void
 destroy_th_barrier(th_barrier *barrier)
 {
 /* Release both mutexes and both condition variables of the barrier and
    reset its counters to zero. */
 pthread_mutex_destroy(&barrier->enter_mutex);
 pthread_mutex_destroy(&barrier->exit_mutex);
 pthread_cond_destroy(&barrier->lastthread_cond);
 pthread_cond_destroy(&barrier->exit_cond);
 barrier->nthreads=0;
 barrier->tcount=barrier->nthreads;
 }
 /* the main operation of the barrier */
 void
 sync_barrier(th_barrier *barrier)
 {
 /* Block the calling thread until barrier->nthreads threads have called
    this function, then let all of them continue. Returns immediately when
    the barrier was set up for fewer than 2 threads.
    NOTE(review): both pthread_cond_wait() calls below are not wrapped in a
    predicate loop, so a spurious wakeup (permitted by POSIX) would let a
    thread through early - confirm whether this matters for the callers. */
 /* trivial case */
 if(barrier->nthreads <2) return;
 /* else */
 /* new threads enters the barrier. Now close the entry door
  so that other threads cannot enter the barrier until we are done */
 pthread_mutex_lock(&barrier->enter_mutex);
 /* next lock the exit mutex - no threads can leave either */
 pthread_mutex_lock(&barrier->exit_mutex);
 /* now check to see if this is the last expected thread */
 if( ++(barrier->tcount) < barrier->nthreads) {
  /* no. this is not the last thread. so open the entry door */
  pthread_mutex_unlock(&barrier->enter_mutex);
 /* go to sleep */
  /* (exit_mutex is released while waiting and held again on return) */
  pthread_cond_wait(&barrier->exit_cond,&barrier->exit_mutex);
 } else {
 /* this is the last thread */
 /* it still holds enter_mutex here, so no new thread can enter */
 /* wakeup sleeping threads */
 pthread_cond_broadcast(&barrier->exit_cond);
 /* go to sleep until all threads are woken up
   and leave the barrier */
 pthread_cond_wait(&barrier->lastthread_cond,&barrier->exit_mutex);
 /* now all threads have left the barrier. so open the entry door again */
 pthread_mutex_unlock(&barrier->enter_mutex);
 } 
 /* every thread decrements tcount on its way out, under exit_mutex */
 /* next to the last thread leaving the barrier */
 if(--(barrier->tcount)==1) {
  /* wakeup the last sleeping thread */
  pthread_cond_broadcast(&barrier->lastthread_cond);
 }
 pthread_mutex_unlock(&barrier->exit_mutex);
 } 
 /* master and two slaves */
 //int
 //main(int argc, char *argv[]) {
 // th_pipeline p;
 // 
 // gbdata g;
 //
 // init_pipeline(&p,&g);
 //sync_barrier(&(p.gate1)); /* stop at gate 1 */
 //   g.status=0; /* master work */
 //sync_barrier(&(p.gate2)); /* stop at gate 2 */
 // //exec_pipeline(&p);
 //sync_barrier(&(p.gate1)); /* stop at gate 1 */
 // g.status=10; /* master work */
 //sync_barrier(&(p.gate2)); /* stop at gate 2 */
 // //exec_pipeline(&p);
 // destroy_pipeline(&p);
 // /* still need to free slave_data structs, from data */
 // return 0;
 //}
--- a/src/lib/clmfit.c
+++ b/src/lib/clmfit.c
--- a/src/lib/clmfit_fl.c
+++ b/src/lib/clmfit_fl.c
--- a/src/lib/clmfit_nocuda.c
+++ b/src/lib/clmfit_nocuda.c
--- a/src/lib/consensus_poly.c
+++ b/src/lib/consensus_poly.c
@ -1,349 +0,0 @@
 /*
 *
 Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "sagecal.h"
 #include <math.h>
 #include <stdio.h>
 //#define DEBUG
 /* build matrix with polynomial terms
  B : Npoly x Nf, each row is one basis function
  Npoly : total basis functions
  Nf: frequencies
  freqs: Nfx1 array freqs
  freq0: reference freq
  type : 
  0 :[1 ((f-fo)/fo) ((f-fo)/fo)^2 ...] basis functions
  1 : normalize each row such that norm is 1
  2 : Bernstein poly basis n_C_r x^r (1-x)^(n-r), n=Npoly-1, r=0..n, where x in [0,1] : use min,max values of freq to normalize
     Note: freqs might not be in sorted order, so need to search array to find min,max values
  3: [1 ((f-fo)/fo) (fo/f-1) ((f-fo)/fo)^2 (fo/f-1)^2 ... ] basis, for this case odd Npoly  preferred
 */
 int
 setup_polynomials(double *B, int Npoly, int Nf, double *freqs, double freq0, int type) {
  /* Fill B so that B[ci*Npoly+cm] is basis function cm evaluated at
     frequency ci, for the basis selected by 'type' (0,1,2,3).
     Always returns 0; an unknown type only prints a message to stderr and
     leaves B untouched. Exits the process if memory cannot be allocated. */
  if (type==0 || type==1) {
  double frat,dsum;
  double invf=1.0/freq0;
  int ci,cm;
  for (ci=0; ci<Nf; ci++) {
     B[ci*Npoly]=1.0;
     frat=(freqs[ci]-freq0)*invf;
     /* each term is the previous one times (f-fo)/fo */
     for (cm=1; cm<Npoly; cm++) {
      B[ci*Npoly+cm]=B[ci*Npoly+cm-1]*frat;
     }
  }
 #ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]); 
   }
   printf("\n");
  }
  printf("];\n");
 #endif
  if (type==1) {
   /* normalize each row such that norm is 1 */
   for (cm=0; cm<Npoly; cm++) {
     dsum=0.0;
     for (ci=0; ci<Nf; ci++) {
      dsum+=B[ci*Npoly+cm]*B[ci*Npoly+cm];
     }
     /* an all-zero row is left as zeros instead of dividing by zero */
     if (dsum>0.0) {
      invf=1.0/sqrt(dsum);
     } else {
      invf=0.0;
     }
     for (ci=0; ci<Nf; ci++) {
      B[ci*Npoly+cm] *=invf;
     }
   }
  }
  } else if (type==2) {
   /* Bernstein polynomials */
   /* indices returned by my_idamax/my_idamin are used 1-based here;
      presumably they compare absolute values, fine for positive freqs */
   int idmax=my_idamax(Nf, freqs, 1);
   int idmin=my_idamin(Nf, freqs, 1);
   double fmax=freqs[idmax-1];
   double fmin=freqs[idmin-1];
   double *fact; /* factorial array */
   double *px,*p1x; /* arrays for powers of x and (1-x) */
   /* rows 0 and 1 of px/p1x are always written below, so allocate at least
      two rows even when Npoly==1 (this used to overflow the buffers) */
   size_t npow=(size_t)((Npoly>2)?Npoly:2);
   if ((fact=(double*)calloc((size_t)Npoly,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
   }
   if ((px=(double*)calloc(npow*(size_t)Nf,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
   }
   if ((p1x=(double*)calloc(npow*(size_t)Nf,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
   }
   fact[0]=1.0;
   int ci,cj;
   for (ci=1; ci<Npoly; ci++) {
     fact[ci]=fact[ci-1]*(double)ci;
   }
   /* a zero frequency span (e.g. Nf==1) would divide by zero and fill B
      with NaN; map every frequency to x=0 in that case */
   double invf=(fmax>fmin)?1.0/(fmax-fmin):0.0;
   double frat;
   for (ci=0; ci<Nf; ci++) {
     /* normalize coordinates */
     frat=(freqs[ci]-fmin)*invf;
     px[ci]=1.0;
     p1x[ci]=1.0;
     px[ci+Nf]=frat;
     p1x[ci+Nf]=1.0-frat;
   }
   /* row cj holds the cj-th power, built from row cj-1 times row 1 */
   for (cj=2; cj<Npoly; cj++) {
    for (ci=0; ci<Nf; ci++) {
     px[cj*Nf+ci]=px[(cj-1)*Nf+ci]*px[Nf+ci]; 
     p1x[cj*Nf+ci]=p1x[(cj-1)*Nf+ci]*p1x[Nf+ci]; 
    }
   }
   for (cj=0; cj<Npoly; cj++) { /* ci: freq, cj: poly order */
     /* binomial coefficient (Npoly-1 choose cj) */
     frat=fact[Npoly-1]/(fact[Npoly-cj-1]*fact[cj]);
     for (ci=0; ci<Nf; ci++) {
      B[ci*Npoly+cj]=frat*px[cj*Nf+ci]*p1x[(Npoly-cj-1)*Nf+ci];
     }
   }
 #ifdef DEBUG
   printf("BT=[\n");
   for(cj=0; cj<Npoly; cj++) {
    for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]); 
   }
   printf("\n");
   }
   printf("];\n");
 #endif
   free(fact);
   free(px);
   free(p1x);
  } else if (type==3) { /* [1 (f-fo)/fo (fo/f-1) ... */
   double frat;
   double invf=1.0/freq0;
   int ci,cm;
   for (ci=0; ci<Nf; ci++) {
     B[ci*Npoly]=1.0;
     frat=(freqs[ci]-freq0)*invf;
     double lastval=frat;
     for (cm=1; cm<Npoly; cm+=2) { /* odd values 1,3,5,... */
      B[ci*Npoly+cm]=lastval;
      lastval*=frat;
     }
     frat=(freq0/freqs[ci]-1.0);
     lastval=frat;
     for (cm=2; cm<Npoly; cm+=2) { /* even values 2,4,6,... */
      B[ci*Npoly+cm]=lastval;
      lastval*=frat;
     }
   }
 #ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]); 
   }
   printf("\n");
  }
  printf("];\n");
 #endif
  } else {
    fprintf(stderr,"%s : %d: undefined polynomial type\n",__FILE__,__LINE__);
  }
  return 0;
 }
 /* build matrix with polynomial terms
  B : Npoly x Nf, each row is one basis function
  Bi: Npoly x Npoly pseudo inverse of sum( B(:,col) x B(:,col)' )
  Npoly : total basis functions
  Nf: frequencies
  fratio: Nfx1 array of weighing factors depending on the flagged data of each freq
  Sum taken is a weighted sum, using weights in fratio
 */
 int
 find_prod_inverse(double *B, double *Bi, int Npoly, int Nf, double *fratio) {
  /* Accumulate the weighted sum of outer products of the Nf length-Npoly
     vectors stored in B into Bi, then overwrite Bi with the product
     U diag(1/S) VT obtained from its SVD (singular values not above
     CLM_EPSILON are replaced by 0).
     Always returns 0; exits the process on allocation or LAPACK failure.
     NOTE(review): my_dgemm/my_dgesvd are presumably thin BLAS/LAPACK
     wrappers - their exact conventions are not visible here. */
  int ci,status,lwork=0;
  double w[1],*WORK,*U,*S,*VT;
  /* set Bi to zero */
  memset(Bi,0,sizeof(double)*Npoly*Npoly);
  /* find sum */
  for (ci=0; ci<Nf; ci++) { 
   /* outer product */
   /* Bi += fratio[ci] * b b^T with b=&B[ci*Npoly] (inner dimension 1) */
   my_dgemm('N','T',Npoly,Npoly,1,fratio[ci],&B[ci*Npoly],Npoly,&B[ci*Npoly],Npoly,1.0,Bi,Npoly);
  }
 #ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for (ci=0; ci<Nf; ci++) {
   for(cj=0; cj<Npoly; cj++) {
    printf("%lf ",B[ci*Npoly+cj]); 
   }
   printf("\n");
  }
  printf("];\nBi=[\n");
  for (ci=0; ci<Npoly; ci++) {
   for(cj=0; cj<Npoly; cj++) {
    printf("%lf ",Bi[ci*Npoly+cj]); 
   }
   printf("\n");
  }
  printf("];\n");
 #endif
  if ((U=(double*)calloc((size_t)Npoly*Npoly,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  if ((VT=(double*)calloc((size_t)Npoly*Npoly,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  if ((S=(double*)calloc((size_t)Npoly,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  /* memory for SVD */
  /* first call with lwork=-1: only the required workspace size comes back in w[0] */
  status=my_dgesvd('A','A',Npoly,Npoly,Bi,Npoly,S,U,Npoly,VT,Npoly,w,-1);
  if (!status) {
    lwork=(int)w[0];
  } else {
    printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
    exit(1);
  }
  if ((WORK=(double*)calloc((size_t)lwork,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  /* second call does the actual decomposition with the sized workspace */
  status=my_dgesvd('A','A',Npoly,Npoly,Bi,Npoly,S,U,Npoly,VT,Npoly,WORK,lwork);
  if (status) {
    printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
    exit(1);
  }
  /* find 1/singular values, and multiply columns of U with new singular values */
  for (ci=0; ci<Npoly; ci++) {
   /* tiny singular values are dropped rather than inverted */
   if (S[ci]>CLM_EPSILON) {
    S[ci]=1.0/S[ci];
   } else {
    S[ci]=0.0;
   }
   my_dscal(Npoly,S[ci],&U[ci*Npoly]);
  }
  /* find product U 1/S V^T */
  my_dgemm('N','N',Npoly,Npoly,Npoly,1.0,U,Npoly,VT,Npoly,0.0,Bi,Npoly);
 #ifdef DEBUG
  printf("Bii=[\n");
  for (ci=0; ci<Npoly; ci++) {
   for(cj=0; cj<Npoly; cj++) {
    printf("%lf ",Bi[ci*Npoly+cj]); 
   }
   printf("\n");
  }
  printf("];\n");
 #endif
  free(U);
  free(S);
  free(VT);
  free(WORK);
  return 0;
 }
 /* update Z
   Z: 8N Npoly x M double array (real and complex need to be updated separate)
   N : stations
   M : clusters
   Npoly: no of basis functions
   z : right hand side 8NM Npoly x 1 (note the different ordering from Z)
   Bi : NpolyxNpoly matrix, Bi^T=Bi assumed
 */
 int 
 update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi) { 
 /* For each of the M clusters: gather that cluster's values from z into the
    scratch matrix R (8N rows, Npoly columns), form Q = R Bi with my_dgemm,
    and scatter Q into the cluster's block of Z.
    Always returns 0; exits the process if memory cannot be allocated. */
 /* one block of Z for one direction 2Nx2xNpoly (complex)
    and 8NxNpoly  real values : select one column : 2NxNpoly (complex)
    select real,imag : 2NxNpoly each (vector)
    reshape each to 2NxNpoly matrix => Q
    Bi : NpolyxNpoly matrix = B^T
    for each direction (M values)
    select 2N,2N,... : 2Nx Npoly complex values from z (ordered by M)
    select real,imag: size 2NxNpoly, 2NxNpoly vectors
    reshape to 2NxNpoly => R
    reshape to 2NxNpoly => I (imag)
    then Q=([R I] Bi^T) for each column
    Q=[R_1^T I_1^T R_2^T I_2^T]^T Bi^T for 2 columns
    R_1,I_1,R_2,I_2 : size 2NxNpoly 
    R : (2N 4) x Npoly
    so find Q
 */
 double *R,*Q;
 /* both scratch buffers hold 8N*Npoly doubles */
 if ((R=(double*)calloc((size_t)2*N*Npoly*4,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
 }
 if ((Q=(double*)calloc((size_t)2*N*Npoly*4,sizeof(double)))==0) {
    printf("%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
 }
 int ci,np;
 for (ci=0; ci<M; ci++) {
  for (np=0; np<Npoly; np++) {
   /* select 2N */
   /* stride-4 reads at offsets 0..3 split the 8N values of (cluster ci,
      basis np) into four contiguous runs of 2N inside column np of R */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M], 4, &R[np*8*N], 1); /* R_1 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+1], 4, &R[np*8*N+2*N], 1); /* I_1 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+2], 4, &R[np*8*N+2*2*N], 1); /* R_2 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+3], 4, &R[np*8*N+3*2*N], 1); /* I_2 */
  }
  /* find Q=R B^T */
  /* Q is cleared first because the multiply below accumulates (beta=1.0) */
  memset(Q,0,sizeof(double)*2*N*Npoly*4);
  my_dgemm('N','N',8*N,Npoly,Npoly,1.0,R,8*N,Bi,Npoly,1.0,Q,8*N);
  /* copy back to Z */ 
  /* inverse of the gather above: contiguous runs of Q go back with stride 4 */
  for (np=0; np<Npoly; np++) {
   my_dcopy(2*N, &Q[np*8*N], 1, &Z[8*N*Npoly*ci+8*N*np], 4); 
   my_dcopy(2*N, &Q[np*8*N+2*N], 1, &Z[8*N*Npoly*ci+8*N*np+1], 4); 
   my_dcopy(2*N, &Q[np*8*N+2*2*N], 1, &Z[8*N*Npoly*ci+8*N*np+2], 4); 
   my_dcopy(2*N, &Q[np*8*N+3*2*N], 1, &Z[8*N*Npoly*ci+8*N*np+3], 4); 
  }
 }
 free(R);
 free(Q);
 return 0;
 }
--- a/src/lib/dataio.c
+++ b/src/lib/dataio.c
@ -1,82 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 #include "sagecal.h"
 int 
 open_data_stream(int file, double **d, int *count, int *N, double *freq0, double *ra0, double *dec0) {
  /* Memory-map the open file descriptor 'file' (read/write, shared) and
     parse its header of doubles: [N, freq0, ra0, dec0, ig, <ig ignored
     stations>, data...].
     On return *count is the total number of doubles in the file (header
     included), *N is the station count minus the ignored ones, and *d
     points at the first data value after the header.
     Returns 0; exits the process if the file cannot be stat'ed or mapped,
     or if it is too short to hold the header it announces. */
  struct stat statbuf;
  int ig;
  double *map;
 /* find the file size */
 if (fstat (file,&statbuf) < 0) {
   fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__);
   exit(1);
 }
 /* total double values is size/8 */
 *count=statbuf.st_size/8;
  /* the fixed part of the header is 5 values */
  if (*count<5) {
     fprintf(stderr,"%s: %d: file too small for header\n",__FILE__,__LINE__);
     exit(1);
  }
  /* map the file to memory */
  map=(double*)mmap(NULL,  statbuf.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, file, 0);
  /* mmap reports failure with MAP_FAILED, not NULL (the old test looked at
     the address of the out-parameter and could never fire) */
  if (map==(double*)MAP_FAILED) {
     fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__);
     exit(1);
  }
  /* remove header from data */
  *N=(int)map[0];
  *freq0=map[1];
  *ra0=map[2];
  *dec0=map[3];
  /* read ignored stations and discard them */
  ig=(int)map[4];
  /* the ignored-station list must fit inside the file */
  if (ig<0 || ig>*count-5) {
     fprintf(stderr,"%s: %d: invalid ignored station count %d\n",__FILE__,__LINE__,ig);
     exit(1);
  }
  /* make correct value for N */
  *N=*N-ig;
  printf("Ignoring %d stations\n",ig);
  /* increment to data */
  /* NOTE(review): the pointer handed back is offset into the mapping, so it
     is not the mapping's base address - check what callers pass to munmap */
  *d=&map[5+ig];
  return(0);
 }
 int
 close_data_stream(double *d, int count) {
  /* Flush and then unmap 'count' doubles starting at d. Return values of
     msync/munmap are not inspected; always returns 0. */
  const size_t nbytes=(size_t)count*sizeof(double);
  /* sync to disk */
  msync(d, nbytes, MS_SYNC );
  munmap((void*)d, nbytes);
  return 0;
 }
--- a/src/lib/diag_fl.cu
+++ b/src/lib/diag_fl.cu
@ -1,270 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "cuda.h"
 #include <cuComplex.h>
 #include <stdio.h>
 /* enable this for checking for kernel failure */
 #define CUDA_DBG
 __global__ void 
 kernel_sqrtdiv_fl(int M, float eps, float *__restrict__ x){
  /* One thread per element (1D launch): x[i] <= 1/sqrt(x[i]) when x[i]>eps,
     otherwise x[i] <= 0. */
  const unsigned int idx = threadIdx.x + blockDim.x*blockIdx.x;
  /* make sure to use only M threads */
  if (!(idx<M)) {
    return;
  }
  const float val=x[idx];
  x[idx]=(val>eps)?(1.0f/sqrtf(val)):0.0f;
 }
 __global__ void 
 kernel_diagmult_fl(int M, float *__restrict__ U, const float *__restrict__ D) {
  /* One thread per element of the MxM array U (1D launch): element idx is
     scaled by D[idx/M], i.e. every run of M consecutive values shares one
     entry of D. */
  const unsigned int idx = threadIdx.x + blockDim.x*blockIdx.x;
  if (idx<M*M) {
     /* which column this thread operates on */
     const unsigned int col = idx/M;
     U[idx]*=D[col];
  }
 }
 __global__ void 
 kernel_jnorm_fl(int N, int M, const float *__restrict__ J, float *__restrict__ d) {
  /* One thread per output (1D launch): d[row] <= sum over c of
     J[row*M+c]^2 for c=0..M-1. */
  const unsigned int row = threadIdx.x + blockDim.x*blockIdx.x;
  /* each thread handles one row */  
  if (row<N) {
    /* accumulate locally, store once at the end */
    float acc=0.0f;
    for (int c=0; c<M; c++) {
     /* J is transposed, so read each column */
     const float v=J[row*M+c];
     acc=acc+v*v;
    }
    d[row]=acc;
  }
 }
 /* Jacobian kernel, 2D launch: x indexes the baseline n (0..Nbase-1) and
    y indexes the parameter m (0..M-1). For a parameter that belongs to one
    of the two stations of baseline n, the thread writes the 8 values
    (4 complex numbers as re,im pairs) of G1*C*G2^H into
    jac[m+M*(8*n+k)], k=0..7, where the station owning m is replaced by a
    unit vector in that parameter. Other (n,m) pairs are not written, so jac
    must be zeroed by the caller.
    bb holds 3 shorts per baseline, the first two are the station indices.
    The last argument N is not used inside the kernel. */
 __global__ void 
 kernel_jacf_fl2(int Nbase, int M, float *__restrict__ jac, const float *__restrict__ coh, const float *__restrict__ p, const short *__restrict__ bb, int N){
  /* global thread index : equal to the baseline */
  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
  /* which parameter:0...M */
  unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
  if(n<Nbase && m<M) {
    int sta1=(int)bb[3*n];
    int sta2=(int)bb[3*n+1];
    /* condition for calculating this baseline sum is 
     If this baseline is flagged,
     or if this parameter does not belong to sta1 or sta2
     we do not compute
    */
    int stc=m>>3; /* 0...Ns-1 (because M=total par= 8 * Nstations */
    /* flags are not taken into account */
    if (((stc==sta2)||(stc==sta1))) {   
     /* 2x2 coherency of this baseline, 8 floats as 4 (re,im) pairs */
     cuFloatComplex C[4];
     C[0].x=coh[8*n];
     C[0].y=coh[8*n+1];
     C[1].x=coh[8*n+2];
     C[1].y=coh[8*n+3];
     C[2].x=coh[8*n+4];
     C[2].y=coh[8*n+5];
     C[3].x=coh[8*n+6];
     C[3].y=coh[8*n+7]; 
     /* which parameter exactly 0..7 */
     int stoff=m-stc*8;
     float pp1[8]; 
     float pp2[8]; 
     /* the station that owns parameter m gets a unit vector (1 at stoff),
        the other station gets its current parameter values from p */
     if (stc==sta1) {
      for (int cn=0; cn<8; cn++) {
       pp1[cn]=0.0f;
       pp2[cn]=p[sta2*8+cn];
      }
      pp1[stoff]=1.0f;
     } else if (stc==sta2) {
      for (int cn=0; cn<8; cn++) {
       pp2[cn]=0.0f;
       pp1[cn]=p[sta1*8+cn];
      }
      pp2[stoff]=1.0f;
     }
     cuFloatComplex G1[4];
     G1[0].x=pp1[0];
     G1[0].y=pp1[1];
     G1[1].x=pp1[2];
     G1[1].y=pp1[3];
     G1[2].x=pp1[4];
     G1[2].y=pp1[5];
     G1[3].x=pp1[6];
     G1[3].y=pp1[7];
     cuFloatComplex T1[4];
     /* T=G1*C */
     T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
     T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
     T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
     T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
     cuFloatComplex G2[4];
     /* conjugate this */
     /* elements 1 and 2 are swapped as well, so G2 is the conjugate transpose */
     G2[0].x=pp2[0];
     G2[0].y=-pp2[1];
     G2[2].x=pp2[2];
     G2[2].y=-pp2[3];
     G2[1].x=pp2[4];
     G2[1].y=-pp2[5];
     G2[3].x=pp2[6];
     G2[3].y=-pp2[7];
     /* T2=T1*G2 */
     cuFloatComplex T2[4];
     T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
     T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
     T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
     T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
     /* update jacobian */
     /* NOTE: row major order */
     jac[m+M*8*n]=T2[0].x;
     jac[m+M*(8*n+1)]=T2[0].y;
     jac[m+M*(8*n+2)]=T2[1].x;
     jac[m+M*(8*n+3)]=T2[1].y;
     jac[m+M*(8*n+4)]=T2[2].x;
     jac[m+M*(8*n+5)]=T2[2].y;
     jac[m+M*(8*n+6)]=T2[3].x;
     jac[m+M*(8*n+7)]=T2[3].y;
    } 
   }
 }
 /* only use extern if calling code is C */
 extern "C"
 {
 /* Shared post-launch check for the drivers below: when CUDA_DBG is defined,
    print the pending CUDA runtime error (if any) and exit(-1); otherwise do
    nothing.
    file,line: location of the calling driver, used in the message */
 static void
 check_kernel_error_fl(const char *file, int line) {
 #ifdef CUDA_DBG
  cudaError_t error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),file,line);
    exit(-1);
  }
 #else
  (void)file;
  (void)line;
 #endif
 }
 /* cuda driver for calculating jacf() */
 /* p: params (Mx1), jac: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
 /* jac is zeroed first; the kernel then fills only the entries whose
    parameter belongs to a station of the baseline. Blocks until the kernel
    has finished. Mclus is not used here. */
 void
 cudakernel_jacf_fl2(float *p, float *jac, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
  /* NOTE: use small value for ThreadsPerBlock here, like 8 */
  dim3 threadsPerBlock(16, 8);
  /* jacobian: Nbase x Nstations (proportional to N), so */
  dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x, 
               (M+threadsPerBlock.y-1)/threadsPerBlock.y);
  /* set memory of jac to zero; widen before multiplying so that N*M is not
     evaluated (and overflowed) in int for large problems */
  cudaMemset(jac, 0, (size_t)N*(size_t)M*sizeof(float));
 // printf("Kernel Jax data size=%d, params=%d, block=%d,%d, thread=%d,%d, baselines=%d\n",N, M, numBlocks.x,numBlocks.y, threadsPerBlock.x, threadsPerBlock.y, Nbase);
  kernel_jacf_fl2<<< numBlocks, threadsPerBlock>>>(Nbase,  M, jac, coh, p, bbh, Nstations);
  cudaDeviceSynchronize();
  check_kernel_error_fl(__FILE__,__LINE__);
 }
 /* invert sqrt(singular values)  1/Sd[]  for Sd[]> eps */
 /* Sd: Mx1 device array, updated in place; launch is 1D with the given
    BlocksPerGrid x ThreadsPerBlock, which must cover M threads.
    Blocks until the kernel has finished. */
 void
 cudakernel_sqrtdiv_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float eps, float *Sd) {
  kernel_sqrtdiv_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, eps, Sd);
  cudaDeviceSynchronize();
  check_kernel_error_fl(__FILE__,__LINE__);
 }
 /* U <= U D, 
   U : MxM
   D : Mx1, diagonal matrix
   launch is 1D and must cover M*M threads; blocks until the kernel has finished
 */
 void
 cudakernel_diagmult_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float *U, float *D) {
  kernel_diagmult_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, U, D);
  cudaDeviceSynchronize();
  check_kernel_error_fl(__FILE__,__LINE__);
 }
 /* diag(J^T J)
   d[i] = J[i,:] * J[i,:]
   J: NxM (in row major order, so J[i,:] is actually J[:,i]
   d: Nx1
   launch is 1D and must cover N threads; blocks until the kernel has finished
 */
 void
 cudakernel_jnorm_fl(int ThreadsPerBlock, int BlocksPerGrid, float *J, int N, int M, float *d) {
  kernel_jnorm_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N,M,J,d);
  cudaDeviceSynchronize();
  check_kernel_error_fl(__FILE__,__LINE__);
 }
 }
--- a/src/lib/diagnostics.c
+++ b/src/lib/diagnostics.c
@ -1,550 +0,0 @@
 /*
 *
 Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "sagecal.h"
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 #include <cuda_runtime.h>
 #include <pthread.h>
 #include <math.h>
 /* Abort with a message when a CUDA runtime call failed.
   err: status returned by the CUDA call
   file,line: call site, normally __FILE__ and __LINE__
   The check is compiled in only when CUDA_DEBUG is defined; otherwise
   this function does nothing and errors pass silently.
 */
 static void
 checkCudaError(cudaError_t err, const char *file, int line)
 {
 #ifdef CUDA_DEBUG
    if (err != cudaSuccess) {
        fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
        exit(EXIT_FAILURE);
    }
 #endif
 }
 /* Abort with a message when a cuBLAS call failed.
   cbstatus: status returned by the cuBLAS call
   file,line: call site, normally __FILE__ and __LINE__ (file is const
    because string literals are passed in, matching checkCudaError)
   The check is compiled in only when CUDA_DEBUG is defined; otherwise
   this function does nothing and errors pass silently.
 */
 static void
 checkCublasError(cublasStatus_t cbstatus, const char *file, int line)
 {
 #ifdef CUDA_DEBUG
   if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
    exit(EXIT_FAILURE);
   }
 #endif
 }
 /* find for one cluster J (J^T J + mu I)^-1 J^T and extract diagonal as output
  p: parameters M x 1 (host memory, copied to the device here)
  rd: residual vector N x 1 (on the device, invariant); not referenced in
      this routine at present
  x: (output, host memory) diagonal of leverage matrix, N x 1
  M: no. of parameters, N: no. of data points (changes in hybrid mode)
  cbhandle,solver_handle: cuBLAS/cuSOLVER handles
  gWORK: device workspace, partitioned below
  tileoff: tile offset of this chunk, needed for hybrid parameters
  ntiles: no. of tiles in this chunk
  dp: has all additional info: coherency,baselines,flags
  returns 0; CUDA/cuBLAS failures abort only in CUDA_DEBUG builds
 */
 static int
 calculate_leverage(float *p, float *rd, float *x, int M, int N, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle, float *gWORK, int tileoff, int ntiles, me_data_t *dp) {
 /* p needs to be copied to device and x needs to be copied back from device
  rd always remains in the device (select part with the right offset)
  N will change in hybrid mode, so copy back to x with right offset */
 int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
 float *jacd,*xd,*jacTjacd,*pd,*cohd,*Ud,*VTd,*Sd;
 unsigned long int moff=0;
 short *bbd;
 cudaError_t err;
 /* total storage N+M*N+M*M+M+Nbase*8+M*M+M*M+M+M+Nbase*3(short)/(float) */
 /* offsets are accumulated in unsigned long so that M*N cannot wrap an int */
 xd=&gWORK[moff];
 moff+=(unsigned long int)N;
 jacd=&gWORK[moff];
 moff+=(unsigned long int)M*N;
 jacTjacd=&gWORK[moff];
 moff+=(unsigned long int)M*M;
 pd=&gWORK[moff];
 moff+=(unsigned long int)M;
 cohd=&gWORK[moff];
 moff+=(unsigned long int)Nbase*8;
 Ud=&gWORK[moff];
 moff+=(unsigned long int)M*M;
 VTd=&gWORK[moff];
 moff+=(unsigned long int)M*M;
 Sd=&gWORK[moff];
 moff+=(unsigned long int)M;
 bbd=(short*)&gWORK[moff];
 moff+=(Nbase*3*sizeof(short))/sizeof(float);
 err=cudaMemcpyAsync(pd, p, M*sizeof(float), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 /* need to give right offset for coherencies */
 /* offset: cluster offset+time offset */
 err=cudaMemcpyAsync(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 /* correct offset for baselines */
 err=cudaMemcpyAsync(bbd, &(dp->ddbase[3*(dp->Nbase)*(tileoff)]), Nbase*3*sizeof(short), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 cudaDeviceSynchronize();
 int ThreadsPerBlock=DEFAULT_TH_PER_BK;
 int ci,Mi;
 /* extra storage for cusolver: allocated per call, released before return */
 int work_size=0;
 int *devInfo;
 err=cudaMalloc((void**)&devInfo, sizeof(int));
 checkCudaError(err,__FILE__,__LINE__);
 float *work;
 float *rwork;
 cusolverDnSgesvd_bufferSize(solver_handle, M, M, &work_size);
 err=cudaMalloc((void**)&work, work_size*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**)&rwork, 5*M*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 /* set mem to 0 */
 err=cudaMemset(xd, 0, N*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 /* calculate J^T, not taking flags into account */
 cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
 /* calculate JTJ = (J^T) (J^T)^T, M x M */
 cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
 float cone=1.0f; float czero=0.0f;
 cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_T,M,M,N,&cone,jacd,M,jacd,M,&czero,jacTjacd,M);
 /* check every cuBLAS call, so an early failure is not overwritten by a later status */
 checkCublasError(cbstatus,__FILE__,__LINE__);
 /* add mu * I to JTJ */
 cudakernel_diagmu_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, jacTjacd, 1e-9f);
 /* calculate inv(JTJ) using SVD */
 /* inv(JTJ) = Ud x Sid x VTd : we take into account that JTJ is symmetric */
 cusolverDnSgesvd(solver_handle,'A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M,work,work_size,rwork,devInfo);
 cudaDeviceSynchronize();
 /* find Sd= 1/sqrt(Sd) of the singular values (positive singular values) */
 cudakernel_sqrtdiv_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, 1e-9f, Sd);
 /* multiply Ud with Sid (diagonal) Ud <= Ud Sid (columns modified) */
 cudakernel_diagmult_fl(ThreadsPerBlock, (M*M+ThreadsPerBlock-1)/ThreadsPerBlock, M, Ud, Sd);
 /* now multiply Ud VTd to get the square root */
 cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_N,M,M,M,&cone,Ud,M,VTd,M,&czero,jacTjacd,M);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 /* calculate J^T, without taking flags into account (use same storage as previous J^T) */
 cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
 /* multiply (J^T)^T sqrt(B)  == sqrt(B)^T J^T, taking M columns at a time */
 for (ci=0; ci<(N+M-1)/M;ci++) {
  /* the last block may hold fewer than M columns */
  if (ci*M+M<N) {
   Mi=M;
  } else {
   Mi=N-ci*M;
  }
  /* start of this block of columns; size_t so that ci*M*M cannot wrap an int */
  float *jblock=&jacd[(size_t)ci*M*M];
  /* VTd is reused as scratch for the M x Mi product */
  cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_T,CUBLAS_OP_N,M,Mi,M,&cone,jacTjacd,M,jblock,M,&czero,VTd,M);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  err=cudaMemcpy(jblock,VTd,Mi*M*sizeof(float),cudaMemcpyDeviceToDevice);
  checkCudaError(err,__FILE__,__LINE__);
 }
 /* xd[i] <= ||J[i,:]||^2 */
 cudakernel_jnorm_fl(ThreadsPerBlock, (N+ThreadsPerBlock-1)/ThreadsPerBlock, jacd, N, M, xd);
 /* output x <=xd */
 err=cudaMemcpyAsync(x, xd, N*sizeof(float), cudaMemcpyDeviceToHost,0);
 cudaDeviceSynchronize();
 checkCudaError(err,__FILE__,__LINE__);
 /* release the cusolver scratch buffers allocated above; without this every
    call (one per cluster and chunk) leaked device memory */
 err=cudaFree(work);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(rwork);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(devInfo);
 checkCudaError(err,__FILE__,__LINE__);
 return 0;
 }
 /******************** pipeline functions **************************/
 /* Data shared between the master and the two slave threads of the
    diagnostics pipeline; arrays of length 2 are indexed by slave thread id */
 typedef struct gb_data_dg_ {
  int status[2]; /* task requested from each slave (one of the PT_DO_* values) */
  float *p[2]; /* pointer to parameters being used by each thread (depends on cluster) */
  float *xo; /* residual vector (copied to device) */
  float *x[2]; /* output leverage values from each thread */
  int M[2]; /* no. of parameters (per cluster,hybrid) */
  int N[2]; /* no. of visibilities (might change in hybrid mode) */
  me_data_t *lmdata[2]; /* two for each thread */
  /* GPU related info */
  cublasHandle_t cbhandle[2]; /* CUBLAS handles */
  cusolverDnHandle_t solver_handle[2]; /* cuSOLVER handles */
  float *rd[2]; /* residual vector on the device (invariant) */
  float *gWORK[2]; /* GPU buffers */
  int64_t data_size; /* size of buffer (bytes) */
 } gbdatadg;
 /* slave thread 2GPU function
   data: slave_tdata of this thread (thread id and link to the pipeline)
   Each round the thread waits at gate 1, leaves if the terminate flag is
   set, waits at gate 2 and then executes the task found in status[tid]:
    PT_DO_CDERIV: leverage for the current cluster, one call per chunk
    PT_DO_AGPU: attach the GPU and copy the residual vector to the device
    PT_DO_DGPU: free the device residual and detach the GPU
    PT_DO_NOTHING: idle
   Any other status is reported and terminates the program.
   returns NULL
 */
 static void *
 pipeline_slave_code_dg(void *data)
 {
 slave_tdata *td=(slave_tdata*)data;
 gbdatadg *gd=(gbdatadg*)(td->pline->data);
 int tid=td->tid;
 while(1) {
  sync_barrier(&(td->pline->gate1)); /* stop at gate 1*/
  if(td->pline->terminate) break; /* if flag is set, break loop */
  sync_barrier(&(td->pline->gate2)); /* stop at gate 2 */
 /* do work */
  if (gd->status[tid]==PT_DO_CDERIV) {
    me_data_t *t=(me_data_t *)gd->lmdata[tid];
    /* divide the tiles into chunks tilesz/nchunk */
    int tilechunk=(t->tilesz+t->carr[t->clus].nchunk-1)/t->carr[t->clus].nchunk;
    int ci;
    int cj=0;
    int ntiles;
    /* loop over chunk, right set of parameters and residual vector */
    for (ci=0; ci<t->carr[t->clus].nchunk; ci++) {
     /* divide the tiles into chunks tilesz/nchunk */
     if (cj+tilechunk<t->tilesz) {
      ntiles=tilechunk;
     } else {
      ntiles=t->tilesz-cj;
     }
     /* when nchunk does not divide tilesz evenly, the rounded-up chunk
        size exhausts the tiles early: the remaining chunks are empty and
        their offsets lie beyond the data, so stop here */
     if (ntiles<=0) {
      break;
     }
    /* right offset for rd[] and x[] needed and since no overlap,
       can wait for all chunks to complete  */
    calculate_leverage(&gd->p[tid][ci*(gd->M[tid])],&gd->rd[tid][8*cj*t->Nbase],&gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->cbhandle[tid], gd->solver_handle[tid], gd->gWORK[tid], cj, ntiles, gd->lmdata[tid]);
    cj=cj+tilechunk;
   }
  } else if (gd->status[tid]==PT_DO_AGPU) {
    attach_gpu_to_thread2(tid,&gd->cbhandle[tid],&gd->solver_handle[tid],&gd->gWORK[tid],gd->data_size,1);
    /* copy residual vector to device */
    cudaError_t err;
    me_data_t *t=(me_data_t *)gd->lmdata[tid];
    /* one byte count, computed in size_t, for both the allocation and the copy */
    size_t rd_bytes=(size_t)8*t->tilesz*t->Nbase*sizeof(float);
    err=cudaMalloc((void**)&gd->rd[tid], rd_bytes);
    checkCudaError(err,__FILE__,__LINE__);
    err=cudaMemcpy(gd->rd[tid], gd->xo, rd_bytes, cudaMemcpyHostToDevice);
    checkCudaError(err,__FILE__,__LINE__);
  } else if (gd->status[tid]==PT_DO_DGPU) {
    cudaFree(gd->rd[tid]);
    detach_gpu_from_thread2(gd->cbhandle[tid],gd->solver_handle[tid],gd->gWORK[tid],1);
  } else if (gd->status[tid]!=PT_DO_NOTHING) { /* catch error */
    fprintf(stderr,"%s: %d: invalid mode for slave tid=%d status=%d\n",__FILE__,__LINE__,tid,gd->status[tid]);
    exit(1);
  }
 }
 return NULL;
 }
 /* initialize the pipeline
  and start the slaves rolling
  pline: pipeline to set up (barriers, thread attributes, task history,
         per-slave data and the two slave threads)
  data: shared data handed to the slaves through pline->data
  Exits the program if memory cannot be allocated or a slave thread cannot
  be created (the master would otherwise wait forever at a 3-party barrier).
 */
 static void
 init_pipeline_dg(th_pipeline *pline,
     void *data)
 {
 slave_tdata *t0,*t1;
 pthread_attr_init(&(pline->attr));
 pthread_attr_setdetachstate(&(pline->attr),PTHREAD_CREATE_JOINABLE);
 init_th_barrier(&(pline->gate1),3); /* 3 threads, including master */
 init_th_barrier(&(pline->gate2),3); /* 3 threads, including master */
 pline->terminate=0;
 pline->data=data; /* shared data used by both slaves */
 if ((t0=(slave_tdata*)malloc(sizeof(slave_tdata)))==0) {
    fprintf(stderr,"no free memory\n");
    exit(1);
 }
 if ((t1=(slave_tdata*)malloc(sizeof(slave_tdata)))==0) {
    fprintf(stderr,"no free memory\n");
    exit(1);
 }
 if ((pline->thst=(taskhist*)malloc(sizeof(taskhist)))==0) {
    fprintf(stderr,"no free memory\n");
    exit(1);
 }
 init_task_hist(pline->thst);
 t0->pline=t1->pline=pline;
 t0->tid=0;
 t1->tid=1;
 /* link t0, t1 back to the pipeline so they can be freed on destruction */
 pline->sd0=t0;
 pline->sd1=t1;
 if (pthread_create(&(pline->slave0),&(pline->attr),pipeline_slave_code_dg,(void*)t0)!=0) {
    fprintf(stderr,"%s: %d: cannot create slave thread 0\n",__FILE__,__LINE__);
    exit(1);
 }
 if (pthread_create(&(pline->slave1),&(pline->attr),pipeline_slave_code_dg,(void*)t1)!=0) {
    fprintf(stderr,"%s: %d: cannot create slave thread 1\n",__FILE__,__LINE__);
    exit(1);
 }
 }
 /* destroy the pipeline */
 /* need to kill the slaves first */
 /* pline: pipeline created by init_pipeline_dg; the order below matters:
    the slaves must have left their loop before the barriers they wait on
    and the per-slave data they read are destroyed */
 static void
 destroy_pipeline_dg(th_pipeline *pline)
 {
 /* slaves test this flag right after gate 1 and break out of their loop */
 pline->terminate=1;
 /* meet the slaves at gate 1 so that they see the flag */
 sync_barrier(&(pline->gate1));
 /* wait until both slave threads have returned */
 pthread_join(pline->slave0,NULL);
 pthread_join(pline->slave1,NULL);
 destroy_th_barrier(&(pline->gate1));
 destroy_th_barrier(&(pline->gate2));
 pthread_attr_destroy(&(pline->attr));
 destroy_task_hist(pline->thst);
 free(pline->thst);
 /* per-slave data allocated in init_pipeline_dg */
 free(pline->sd0);
 free(pline->sd1);
 /* the shared data is owned by the caller: only drop the reference */
 pline->data=NULL;
 }
 /******************** end pipeline functions **************************/
 /*  Calculate St.Laurent-Cook Jacobian leverage
  u,v,w: baseline coordinates (only stored in the fit data here)
  p: parameters, Mt*8*N values
  xo: residual, Nbase*tilesz*8 values (replaced by the leverage if diagmode==1)
  N: no. of stations, Nbase: baselines per tile, tilesz: no. of tiles
  barr: baseline info, carr: cluster info
  flags: 2 for flags based on uvcut, 1 for normal flags
  coh: coherencies are calculated for all baselines, regardless of flag
  M: no. of clusters, Mt: no. of effective (hybrid) clusters
  diagmode: 1: replace residual, 2: calc noise/leverage ratio
  Nt: no. of threads for the CPU helper routines
  Two slave threads work on clusters in pairs; their leverage values are
  summed over all clusters.
  returns 0
 */
 int
 calculate_diagnostics(double *u,double *v,double *w,double *p,double *xo,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, complex double *coh, int M,int Mt,int diagmode, int Nt) {
  int cj;
  int n;
  me_data_t lmdata0,lmdata1;
  int Nbase1;
  /* no of data */
  n=Nbase*tilesz*8;
  /* true no of baselines */
  Nbase1=Nbase*tilesz;
  double *ddcoh;
  short *ddbase;
  int c0,c1;
  float *ddcohf, *pf, *xdummy0f, *xdummy1f, *res0, *dgf;
 /********* thread data ******************/
  /* barrier */
  th_pipeline tp;
  gbdatadg tpg;
 /****************************************/
  lmdata0.clus=lmdata1.clus=-1;
  /* setup data for lmfit */
  lmdata0.u=lmdata1.u=u;
  lmdata0.v=lmdata1.v=v;
  lmdata0.w=lmdata1.w=w;
  lmdata0.Nbase=lmdata1.Nbase=Nbase;
  lmdata0.tilesz=lmdata1.tilesz=tilesz;
  lmdata0.N=lmdata1.N=N;
  lmdata0.barr=lmdata1.barr=barr;
  lmdata0.carr=lmdata1.carr=carr;
  lmdata0.M=lmdata1.M=M;
  lmdata0.Mt=lmdata1.Mt=Mt;
  lmdata0.freq0=lmdata1.freq0=NULL; /* not used */
  lmdata0.Nt=lmdata1.Nt=Nt;
  lmdata0.coh=lmdata1.coh=coh;
  /* rearrange coh for GPU use */
  if ((ddcoh=(double*)calloc((size_t)(M*Nbase1*8),sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  if ((ddcohf=(float*)calloc((size_t)(M*Nbase1*8),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  if ((ddbase=(short*)calloc((size_t)(Nbase1*3),sizeof(short)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  rearrange_coherencies2(Nbase1, barr, coh, ddcoh, ddbase, M, Nt);
  lmdata0.ddcoh=lmdata1.ddcoh=ddcoh;
  lmdata0.ddbase=lmdata1.ddbase=ddbase;
  /* ddcohf (float) << ddcoh (double) */
  double_to_float(ddcohf,ddcoh,M*Nbase1*8,Nt);
  lmdata0.ddcohf=lmdata1.ddcohf=ddcohf;
  if ((pf=(float*)calloc((size_t)(Mt*8*N),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  double_to_float(pf,p,Mt*8*N,Nt);
  /* residual */
  if ((res0=(float*)calloc((size_t)(n),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  double_to_float(res0,xo,n,Nt);
  /* sum of diagonal values of leverage */
  if ((dgf=(float*)calloc((size_t)(n),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  if ((xdummy0f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
  if ((xdummy1f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
 /********** setup threads *******************************/
  /* also calculate the total storage needed to be allocated on a GPU */
  /* determine total size for memory allocation
     residual = n (separately allocated)
     diagonal = n
    For one cluster,
     Jacobian = nxm,  J^T J = mxm, (also inverse)
  */
  int Mm=8*N; /* no of parameters */
  int64_t data_sz=0;
  /* every term is widened to 64 bits before multiplying: Mm*n easily
     exceeds the range of int for realistic problem sizes */
  data_sz=((int64_t)n+(int64_t)Mm*(int64_t)n+3*(int64_t)Mm*(int64_t)Mm+3*(int64_t)Mm+(int64_t)Nbase1*8)*(int64_t)sizeof(float)+(int64_t)Nbase1*3*(int64_t)sizeof(short);
  tpg.data_size=data_sz;
  tpg.lmdata[0]=&lmdata0;
  tpg.lmdata[1]=&lmdata1;
  tpg.xo=res0; /* residual */
  init_pipeline_dg(&tp,&tpg);
  /* tasks are posted between gate 1 and gate 2; the following gate 1
     is passed only after both slaves have finished the task */
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
  tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
  sync_barrier(&(tp.gate2)); /* sync at gate 2*/
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
  tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2*/
 /********** done setup threads *******************************/
  tpg.x[0]=xdummy0f;
  tpg.M[0]=8*N; /* even though size of p is > M, dont change this */
  tpg.N[0]=n; /* Nbase*tilesz*8 */
  tpg.x[1]=xdummy1f;
  tpg.M[1]=8*N; /* even though size of p is > M, dont change this */
  tpg.N[1]=n; /* Nbase*tilesz*8 */
  for (cj=0; cj<M/2; cj++) { /* iter per cluster pairs */
   c0=2*cj;
   c1=2*cj+1;
   sync_barrier(&(tp.gate1)); /* sync at gate 1 */
   lmdata0.clus=c0;
   lmdata1.clus=c1;
   /* run this from a separate thread */
   tpg.p[0]=&pf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
   tpg.p[1]=&pf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
   tpg.status[0]=tpg.status[1]=PT_DO_CDERIV;
   sync_barrier(&(tp.gate2)); /* sync at gate 2 */
   sync_barrier(&(tp.gate1)); /* sync at gate 1 */
   tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
   sync_barrier(&(tp.gate2)); /* sync at gate 2 */
   /* add result to the sum */
   my_saxpy(n, xdummy0f, 1.0f, dgf);
   my_saxpy(n, xdummy1f, 1.0f, dgf);
  }
  /* odd cluster out, if M is odd */
  if (M%2) {
   c0=M-1;
   sync_barrier(&(tp.gate1)); /* sync at gate 1 */
   tpg.p[0]=&pf[carr[c0].p[0]];
   lmdata0.clus=c0;
   tpg.status[0]=PT_DO_CDERIV;
   tpg.status[1]=PT_DO_NOTHING;
   sync_barrier(&(tp.gate2)); /* sync at gate 2 */
 /**************************************************************************/
   sync_barrier(&(tp.gate1)); /* sync at gate 1 */
   tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
   sync_barrier(&(tp.gate2)); /* sync at gate 2 */
   my_saxpy(n, xdummy0f, 1.0f, dgf);
  }
  free(pf);
  free(ddcohf);
  free(xdummy1f);
  free(ddcoh);
  /* res0 is NOT freed here: it is still read below when diagmode!=1 */
  /******** free threads ***************/
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
  tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
  sync_barrier(&(tp.gate2)); /* sync at gate 2*/
  destroy_pipeline_dg(&tp);
  /******** done free threads ***************/
  /* now add 1's to locations with flagged data */
  /* create array for adding */
  create_onezerovec(Nbase1, ddbase, xdummy0f, Nt);
  my_saxpy(n, xdummy0f, 1.0f, dgf);
  free(xdummy0f);
  free(ddbase);
  if (diagmode==1) {
   /* copy back to output */
   float_to_double(xo,dgf,n,Nt);
  } else {
   /* solve system of  equations a * leverage + b * 1 = |residual|
     to find a,b scalars, and just print them as output */
   /* find  1^T |r| = sum (|residual|) and  lev^T |r|  */
   float sum1,sum2;
   find_sumproduct(n, res0, dgf, &sum1, &sum2, Nt);
   float a00,a01,a11;
   a00=my_fnrm2(n,dgf); /* ||lev||, squared below to get lev^T lev */
   a01=my_fasum(n,dgf); /* = a10 = sum|leverage| */
   a00=a00*a00;
   a11=(float)n; /* sum( 1 ) */
   /* NOTE(review): if sum1=sum|res| and sum2=lev^T|res| as the comment
      above says, the normal equations would pair sum2 with the (a00,a01)
      row, i.e. r00/r01 look swapped - confirm against find_sumproduct */
   float r00,r01;
   r00=sum1;
   r01=sum2;
   /* solve A [a b]^T = r */
   float alpha,beta,denom;
   denom=(a00*a11-a01*a01);
   if (denom>1e-6f) { /* can be solved */
    alpha=(r00*a11-r01*a01)/denom;
   } else {
    alpha=0.0f;
   }
   beta=(r00-a00*alpha)/a01;
   printf("Error Noise/Model %e/%e\n",beta,alpha);
  }
  /* safe to release the residual copy only now (it was freed too early
     before, leaving find_sumproduct to read freed memory) */
  free(res0);
  free(dgf);
  return 0;
 }
--- a/src/lib/lbfgs.c
+++ b/src/lib/lbfgs.c
--- a/src/lib/lbfgs_nocuda.c
+++ b/src/lib/lbfgs_nocuda.c
@ -1,926 +0,0 @@
 /*
 *
 Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>  
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 $Id$
 */
 #include "sagecal.h"
 #include <pthread.h>
 /**** repeated code here ********************/
 /* Jones (2x2 complex) matrix product C=A*B.
   Each matrix is stored as 4 values, element (row,col) at index 2*row+col.
   a,b: inputs, c: output (c must not overlap a or b)
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void
 amb(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
 int row,col;
 for (row=0; row<2; row++) {
  for (col=0; col<2; col++) {
   /* row of A times column of B */
   c[2*row+col]=a[2*row]*b[col]+a[2*row+1]*b[2+col];
  }
 }
 }
 /* Jones (2x2 complex) matrix product with Hermitian transpose, C=A*B^H.
   Each matrix is stored as 4 values, element (row,col) at index 2*row+col.
   a,b: inputs, c: output (c must not overlap a or b)
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void
 ambt(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
 int row,col;
 for (row=0; row<2; row++) {
  for (col=0; col<2; col++) {
   /* column col of B^H is the conjugate of row col of B */
   c[2*row+col]=a[2*row]*conj(b[2*col])+a[2*row+1]*conj(b[2*col+1]);
  }
 }
 }
 /**** end repeated code ********************/
 /* worker thread for a cpu */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void *
 cpu_calc_deriv(void *adata) {
 thread_data_grad_t *t=(thread_data_grad_t*)adata;
 int ci,nb;
 int stc,stoff,stm,sta1,sta2;
 int N=t->N; /* stations */
 int M=t->M; /* clusters */
 int Nbase=(t->Nbase)*(t->tilesz);
 complex double xr[4]; /* residuals */
 complex double G1[4],G2[4],C[4],T1[4],T2[4];
 double pp[8];
 complex double csum;
 int cli,tpchunk,pstart,nchunk,tilesperchunk,stci,ttile,tptile,poff;
 /* iterate over each paramter */
 for (ci=t->g_start; ci<=t->g_end; ++ci) {
    t->g[ci]=0.0;
    /* find station and parameter corresponding to this value of ci */
    /* this parameter should correspond to the right baseline (x tilesz)
        to contribute to the derivative */
    cli=0;
    while((cli<M) && (ci<t->carr[cli].p[0] || ci>t->carr[cli].p[0]+8*N*t->carr[cli].nchunk-1)) {
     cli++;
    }
   /* now either cli>=M: cluster not found 
       or cli<M and cli is the right cluster */
   if (cli==M && ci>=t->carr[cli-1].p[0] && ci<=t->carr[cli-1].p[0]+8*N*t->carr[cli-1].nchunk-1) {
    cli--;
   }
   if (cli<M) {
    /* right parameter offset */
    stci=ci-t->carr[cli].p[0];
    stc=(stci%(8*N))/8; /* 0..N-1 */
    /* make sure this baseline contribute to this parameter */
    tpchunk=stci/(8*N);
    nchunk=t->carr[cli].nchunk;
    pstart=t->carr[cli].p[0];
    tilesperchunk=(t->tilesz+nchunk-1)/nchunk;
    /* iterate over all baselines and accumulate sum */
    for (nb=0; nb<Nbase; ++nb) {
     /* which tile is this ? */
     ttile=nb/t->Nbase;
     /* which chunk this tile belongs to */
     tptile=ttile/tilesperchunk;
     /* now tptile has to match tpchunk, otherwise ignore calculation */
     if (tptile==tpchunk) {
     sta1=t->barr[nb].sta1;
     sta2=t->barr[nb].sta2;
     if (((stc==sta1)||(stc==sta2))&& !t->barr[nb].flag) {
      /* this baseline has a contribution */
      /* which paramter of this station */
      stoff=(stci%(8*N))%8; /* 0..7 */
      /* which cluster */
      stm=cli; /* 0..M-1 */
      /* exact expression for derivative 
         2 real( vec^H(residual_this_baseline) 
            * vec(-J_{pm}C_{pqm} J_{qm}^H)
        where m: chosen cluster
        J_{pm},J_{qm} Jones matrices for baseline p-q
        depending on the parameter, J ==> E 
        E: zero matrix, except 1 at location of m
       residual : in x[8*nb:8*nb+7]
       C coh: in coh[8*M*nb+m*8:8*M*nb+m*8+7] (double storage)
           coh[4*M*nb+4*m:4*M*nb+4*m+3] (complex storage)
       J_p,J_q: in p[sta1*8+m*8*N: sta1*8+m*8*N+7]
        and p[sta2*8+m*8*N: sta2*8+m*8*N+ 7]
     */
     /* read in residual vector, conjugated */
     xr[0]=(t->x[nb*8])-_Complex_I*(t->x[nb*8+1]);
     xr[1]=(t->x[nb*8+2])-_Complex_I*(t->x[nb*8+3]);
     xr[2]=(t->x[nb*8+4])-_Complex_I*(t->x[nb*8+5]);
     xr[3]=(t->x[nb*8+6])-_Complex_I*(t->x[nb*8+7]);
     /* read in coherency */
     C[0]=t->coh[4*M*nb+4*stm];
     C[1]=t->coh[4*M*nb+4*stm+1];
     C[2]=t->coh[4*M*nb+4*stm+2];
     C[3]=t->coh[4*M*nb+4*stm+3];
     memset(pp,0,sizeof(double)*8); 
     if (stc==sta1) {
       /* this station parameter gradient */
       pp[stoff]=1.0;
       memset(G1,0,sizeof(complex double)*4); 
       G1[0]=pp[0]+_Complex_I*pp[1];
       G1[1]=pp[2]+_Complex_I*pp[3];
       G1[2]=pp[4]+_Complex_I*pp[5];
       G1[3]=pp[6]+_Complex_I*pp[7];
       poff=pstart+tpchunk*8*N+sta2*8;
       G2[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
       G2[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
       G2[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+4]);
       G2[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
     } else if (stc==sta2) {
       memset(G2,0,sizeof(complex double)*4); 
       pp[stoff]=1.0;
       G2[0]=pp[0]+_Complex_I*pp[1];
       G2[1]=pp[2]+_Complex_I*pp[3];
       G2[2]=pp[4]+_Complex_I*pp[5];
       G2[3]=pp[6]+_Complex_I*pp[7];
       poff=pstart+tpchunk*8*N+sta1*8;
       G1[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
       G1[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
       G1[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+5]);
       G1[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
     }
     /* T1=G1*C */
     amb(G1,C,T1);
     /* T2=T1*G2' */
     ambt(T1,G2,T2);
     /* calculate product xr*vec(J_p C J_q^H ) */
     csum=xr[0]*T2[0];
     csum+=xr[1]*T2[1];
     csum+=xr[2]*T2[2];
     csum+=xr[3]*T2[3];
     /* accumulate sum */
     t->g[ci]+=-2.0*creal(csum);
     }
     }
    }
   }
 }
 return NULL;
 }
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static int
 func_grad(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *p, double *g, double *xo, int m, int n, double step, void *adata) {
  /* gradient for each parameter is
     (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step
    i=0,...,m-1 for all parameters
    e_i: unit vector, 1 only at i-th location
  */
  double *x; /* array to store residual */
  int ci;
  me_data_t *dp=(me_data_t*)adata;
  int Nt=dp->Nt;
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_grad_t *threaddata;
  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  /* evaluate func once, store in x, and create threads */
  /* and calculate the residual x=xo-func */
  func(p,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
 #ifndef USE_MIC
   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
   exit(1);
  }
  if ((threaddata=(thread_data_grad_t*)malloc((size_t)Nt*sizeof(thread_data_grad_t)))==0) {
 #ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
    exit(1);
  }
  int nth,nth1,Nparm;
  /* parameters per thread */
  Nparm=(m+Nt-1)/Nt;
  /* each thread will calculate derivative of part of 
     parameters */
  ci=0;
  for (nth=0;  nth<Nt; nth++) {
   threaddata[nth].Nbase=dp->Nbase;
   threaddata[nth].tilesz=dp->tilesz;
   threaddata[nth].barr=dp->barr;
   threaddata[nth].carr=dp->carr;
   threaddata[nth].M=dp->M;
   threaddata[nth].N=dp->N;
   threaddata[nth].coh=dp->coh;
   threaddata[nth].m=m;
   threaddata[nth].n=n;
   threaddata[nth].x=x;
   threaddata[nth].p=p;
   threaddata[nth].g=g;
   threaddata[nth].g_start=ci;
   threaddata[nth].g_end=ci+Nparm-1;
   if (threaddata[nth].g_end>=m) {
    threaddata[nth].g_end=m-1;
   }
   ci=ci+Nparm;
   pthread_create(&th_array[nth],&attr,cpu_calc_deriv,(void*)(&threaddata[nth]));
  }
  /* now wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
   pthread_join(th_array[nth1],NULL);
  }
  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
  free(x);
  return 0;
 }
 /* use algorithm 9.1 to compute pk=Hk gk */
 /* m: no. of parameters
   pk,gk: size m x 1 (pk is the output)
   s, y: size mM x 1
   rho: size M x 1
   M: no. of stored (s,y) pairs; with M<=0 the result is pk=gk
   ii: true location of the k th values in s,y */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static void
 mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, int M, int ii) {
 int ci;
 double *alphai=0;
 int *idx=0; /* store sorted locations of s, y here */
 double gamma,beta;
 /* allocate only when there is history: calloc(0,..) may legally return
    NULL, which must not be mistaken for running out of memory */
 if (M>0) {
  if ((alphai=(double*)calloc((size_t)M,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  /* idx holds ints, so size the elements as int */
  if ((idx=(int*)calloc((size_t)M,sizeof(int)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  /* find the location of k-1 th value */
  if (ii>0) {
   ii=ii-1;
  } else {
   ii=M-1;
  }
 /* s,y will have 0,1,...,ii,ii+1,...M-1 */
 /* map this to  ii+1,ii+2,...,M-1,0,1,..,ii */
  for (ci=0; ci<M-ii-1; ci++){
   idx[ci]=(ii+ci+1);
  }
  for(ci=M-ii-1; ci<M; ci++) {
   idx[ci]=(ci-M+ii+1);
  }
 }
 #ifdef DEBUG
 printf("prod M=%d, current ii=%d\n",M,ii);
 for(ci=0; ci<M; ci++) {
  printf("%d->%d ",ci,idx[ci]);
 }
 printf("\n");
 #endif
 /* q = grad(f)k : pk<=gk */
 my_dcopy(m,gk,1,pk,1);
 /* this should be done in the right order: newest pair (idx[M-1]) first */
 for (ci=0; ci<M; ci++) {
  /* alphai=rhoi si^T*q */
  alphai[M-ci-1]=rho[idx[M-ci-1]]*my_ddot(m,&s[m*idx[M-ci-1]],pk);
  /* q=q-alphai yi */
  my_daxpy(m,&y[m*idx[M-ci-1]],-alphai[M-ci-1],pk);
 }
 /* r=Hk(0) q : initial hessian */
 /* gamma=s(k-1)^T*y(k-1)/y(k-1)^T*y(k-1)*/
 gamma=1.0;
 if (M>0) {
  gamma=my_ddot(m,&s[m*idx[M-1]],&y[m*idx[M-1]]);
  gamma/=my_ddot(m,&y[m*idx[M-1]],&y[m*idx[M-1]]);
  /* Hk(0)=gamma I, so scale q by gamma */
  /* r= Hk(0) q */
  my_dscal(m,gamma,pk);
 }
 /* second pass: oldest pair (idx[0]) first */
 for (ci=0; ci<M; ci++) {
  /* beta=rhoi yi^T * r */
  beta=rho[idx[ci]]*my_ddot(m,&y[m*idx[ci]],pk);
  /* r = r + (alphai-beta)*si */
  my_daxpy(m,&s[m*idx[ci]],alphai[ci]-beta,pk);
 }
 /* free(NULL) is a no-op, so this also covers the M<=0 case */
 free(alphai);
 free(idx);
 }
 /* cubic interpolation in interval [a,b] (a>b is possible)
   to find step that minimizes cost function */
 /* func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   a/b:  interval for interpolation
   x: size n x 1 (storage)
   xp: size m x 1 (storage)
   xo: observed data size n x 1
   m: no. of parameters
   n: size of vector function
   step: step size for differencing
   adata:  additional data passed to the function
   returns the step (a, b, or the stationary point z0 of the cubic inside
   the interval) that gives the lowest cost ||func(xk+step*pk)-xo||^2
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static double
 cubic_interp(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double a, double b, double *x, double *xp,  double *xo, int m, int n, double step, void *adata) {
  double f0,f1,f0d,f1d; /* function values and derivatives at a,b */
  double p01,p02,z0,fz0;
  double aa,cc;
  /* cost at a */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  f0=my_dnrm2(n,x);
  f0*=f0;
  /* grad(phi_0): central difference, evaluate at +step and -step */
  my_daxpy(m,pk,step,xp); /* xp<=xk+(a+step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xk+(a-step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  f0d=(p01*p01-p02*p02)/(2.0*step);
  /* cost at b */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  f1=my_dnrm2(n,x);
  f1*=f1;
  /* grad(phi_1): central difference, evaluate at +step and -step */
  my_daxpy(m,pk,step,xp); /* xp<=xk+(b+step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xk+(b-step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  f1d=(p01*p01-p02*p02)/(2.0*step);
  /* cubic fitted to the values and derivatives at a and b; its
    stationary point exists when the discriminant p01 is positive */
  aa=3.0*(f0-f1)/(b-a)+(f1d-f0d);
  p01=aa*aa-f0d*f1d;
  /* root exist? */
  if (p01>0.0) {
   /* root, expressed as an absolute step along pk (same scale as a,b) */
   cc=sqrt(p01);
   z0=b-(f1d+cc-aa)*(b-a)/(f1d-f0d+2.0*cc);
   aa=MAX(a,b);
   cc=MIN(a,b);
   /* use the root only if it lies within the interval; written so that
      a NaN root (zero denominator) also fails the test */
   if (z0<=aa && z0>=cc) {
    /* evaluate function for this root: z0 is already an absolute step,
       it must not be rescaled by (b-a) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,z0,xp); /* xp<=xp+(z0)*pk */
    func(xp,x,m,n,adata);
    my_daxpy(n,xo,-1.0,x);
    fz0=my_dnrm2(n,x);
    fz0*=fz0;
    /* now choose between f0,f1,fz0 */
    if (f0<f1 && f0<fz0) {
      return a;
    }
    if (f1<fz0) {
      return b;
    }
    /* else */
    return (z0);
   }
  }
  /* no usable root: find the value from a or b that minimizes func */
  if (f0<f1) {
   return a;
  }
  return b;
 }
 /*************** Fletcher line search **********************************/
 /* Helper for linesearch_zoom: evaluate func at xp (output written to x),
    subtract the observed data xo from x in place, and return my_dnrm2 of
    the difference. The value returned is the norm itself, not its square. */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static double
 zoom_residual_norm(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xp, double *x, double *xo, int m, int n, void *adata) {
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x); /* x<=x-xo */
  return my_dnrm2(n,x);
 }
 /* Zoom (sectioning) phase of the line search.
   phi(alpha) below means ||func(xk+alpha*pk)-xo||^2.
   func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   a/b: bracket interval [a,b] (a>b is possible)
   x: size n x 1 (scratch storage, overwritten)
   xp: size m x 1 (scratch storage, overwritten)
   phi_0: phi(0)
   gphi_0: grad(phi(0))
   sigma,rho,t2,t3: line search parameters; t1 is accepted but not used here
   xo: observed data size n x 1
   m: size of parameter vector
   n: size of vector function
   step: step size for differencing
   adata:  additional data passed to the function
   Returns the last trial step produced by cubic_interp, whether one of the
   termination tests fired or the 10 iteration limit was reached.
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static double 
 linesearch_zoom(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double a, double b, double *x, double *xp,  double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
  double lo=a;        /* current bracket end a_j */
  double hi=b;        /* current bracket end b_j */
  double trial=1.0;   /* alpha_j; always overwritten in the first pass */
  double phi_trial,phi_lo;
  double slope,fwd,bwd,left,right,nrm;
  int iter;

  for (iter=0; iter<10; iter++) {
    /* pick alpha_j inside [a_j+t2(b_j-a_j), b_j-t3(b_j-a_j)] */
    left=lo+t2*(hi-lo);
    right=hi-t3*(hi-lo);
    trial=cubic_interp(func,xk,pk,left,right,x,xp,xo,m,n,step,adata);

    /* phi(alpha_j) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,trial,xp); /* xp<=xk+alpha_j*pk */
    nrm=zoom_residual_norm(func,xp,x,xo,m,n,adata);
    phi_trial=nrm*nrm;

    /* phi(a_j) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,lo,xp); /* xp<=xk+a_j*pk */
    nrm=zoom_residual_norm(func,xp,x,xo,m,n,adata);
    phi_lo=nrm*nrm;

    /* insufficient decrease, or no better than a_j: shrink from the b side */
    if ((phi_trial>phi_0+rho*trial*gphi_0) || phi_trial>=phi_lo) {
      hi=trial; /* a_j unchanged */
      continue;
    }

    /* central difference estimate of grad(phi(alpha_j)) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,trial+step,xp); /* xp<=xk+(alpha_j+step)*pk */
    fwd=zoom_residual_norm(func,xp,x,xo,m,n,adata);
    my_daxpy(m,pk,-2.0*step,xp); /* xp<=xk+(alpha_j-step)*pk */
    bwd=zoom_residual_norm(func,xp,x,xo,m,n,adata);
    slope=(fwd*fwd-bwd*bwd)/(2.0*step);

    /* stop on the roundoff guard (pp. 38, Fletcher) or on the
       curvature condition |grad(phi(alpha_j))| <= -sigma*grad(phi(0)) */
    if ((lo-trial)*slope<=step || fabs(slope)<=-sigma*gphi_0) {
      break;
    }

    if (slope*(hi-lo)>=0) {
      hi=lo;
    } /* else b_j unchanged */
    lo=trial;
  }
 #ifdef DEBUG
  printf("Found %g Interval [%lf,%lf]\n",trial,a,b);
 #endif
  return trial;
 }
 /* line search (bracketing phase, after Fletcher) */
 /* Finds a step length alphak along pk for
    phi(alpha)=||func(xk+alpha*pk)-xo||^2.
   func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   alpha1: initial value for step
   sigma,rho,t1,t2,t3: line search parameters (from Fletcher) 
   xo: observed data size n x 1
   m: size of parameter vector
   n: size of vector function
   step: step size for differencing 
   adata:  additional data passed to the function
   Returns the chosen step length; this stays at 1.0 when none of the
   termination conditions is met within the iteration limit.
   Allocates (and frees) two work vectors; exits the process if the
   allocation fails.
 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 static double 
 linesearch(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
 /* phi(alpha)=f(xk+alpha pk)
  for vector function func 
   f(xk) =||func(xk)-xo||^2 */
  double *x,*xp;
  double alphai,alphai1;
  double phi_0,phi_alphai,phi_alphai1;
  double p01,p02;
  double gphi_0,gphi_i;
  double alphak;
  double mu;
  double tol; /* lower limit for minimization */
  int ci;
  /* x: work vector for function values, size n */
  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  /* xp: work vector for trial parameters, size m */
  if ((xp=(double*)calloc((size_t)m,sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
     exit(1);
  }
  alphak=1.0;
  /* evaluate phi_0 and grad(phi_0) */
  func(xk,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  phi_0=my_dnrm2(n,x);
  phi_0*=phi_0;
  /* select tolerance: 1/100 of current function value, but at most 1e-6 */
  tol=MIN(0.01*phi_0,1e-6);
  /* grad(phi_0): evaluate at -step and +step */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */
  func(xp,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */
  func(xp,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  /* central difference of the squared norms */
  gphi_0=(p01*p01-p02*p02)/(2.0*step);
  /* estimate for mu */
  /* mu = (tol-phi_0)/(rho gphi_0) */
  mu=(tol-phi_0)/(rho*gphi_0);
 #ifdef DEBUG
  printf("mu=%lf, alpha1=%lf\n",mu,alpha1);
 #endif
  ci=1;
  alphai=alpha1; /* initial value for alpha(i) : check if 0<alphai<=mu */
  alphai1=0.0;   /* alpha(i-1) */
  phi_alphai1=phi_0; /* phi(alpha(i-1)) */
  /* ci runs from 1 to 9: at most 9 bracketing iterations */
  while(ci<10) {
   /* evaluate phi(alpha(i))=f(xk+alphai pk) */
   my_dcopy(m,xk,1,xp,1); /* xp<=xk */
   my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */
   func(xp,x,m,n,adata);
   /* calculate x<=x-xo */
   my_daxpy(n,xo,-1.0,x);
   phi_alphai=my_dnrm2(n,x);
   phi_alphai*=phi_alphai;
   if (phi_alphai<tol) {
     alphak=alphai;
 #ifdef DEBUG
     printf("Linesearch : Condition 0 met\n");
 #endif
     break;
   }
   /* sufficient decrease test phi(alpha) <= phi(0)+rho*alpha*grad(phi(0)),
      using the same rho-scaled form as linesearch_zoom; without rho the
      test compares against the tangent line itself */
   if ((phi_alphai>phi_0+rho*alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) {
      /* ai=alphai1, bi=alphai bracket */
      alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
 #ifdef DEBUG
      printf("Linesearch : Condition 1 met\n");
 #endif
      break;
   } 
   /* evaluate grad(phi(alpha(i))) */
   /* xp currently holds xk+alphai*pk, so reset it before adding (alphai+step)*pk */
   my_dcopy(m,xk,1,xp,1); /* xp<=xk */
   my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */
   func(xp,x,m,n,adata);
   /* calculate x<=x-xo */
   my_daxpy(n,xo,-1.0,x);
   p01=my_dnrm2(n,x);
   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */
   func(xp,x,m,n,adata);
   /* calculate x<=x-xo */
   my_daxpy(n,xo,-1.0,x);
   p02=my_dnrm2(n,x);
   gphi_i=(p01*p01-p02*p02)/(2.0*step);
   if (fabs(gphi_i)<=-sigma*gphi_0) {
     alphak=alphai;
 #ifdef DEBUG
     printf("Linesearch : Condition 2 met\n");
 #endif
     break;
   }
   if (gphi_i>=0) {
     /* ai=alphai, bi=alphai1 bracket */
     alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
 #ifdef DEBUG
     printf("Linesearch : Condition 3 met\n");
 #endif
     break;
   }
   /* no bracket yet: extrapolate to the next trial step */
   if (mu<=(2.0*alphai-alphai1)) {
     /* next step */
     alphai1=alphai;
     alphai=mu;
   } else {
     /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */
     p01=2.0*alphai-alphai1;
     p02=MIN(mu,alphai+t1*(alphai-alphai1));
     /* the current step becomes alpha(i-1), keeping it in sync with
        phi_alphai1 which is updated below; the interval end points
        above were computed from the old pair first */
     alphai1=alphai;
     alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
     //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai);
   }
   phi_alphai1=phi_alphai;
   ci++;
  }
  free(x);
  free(xp);
 #ifdef DEBUG
  printf("Step size=%g\n",alphak);
 #endif
  return alphak;
 }
 /*************** END Fletcher line search **********************************/
 /* Allocation helper for lbfgs_fit: returns a zero-initialised array of
    count doubles. On failure it reports the source line given by the
    caller (the message is compiled out when USE_MIC is defined) and
    terminates the process with exit(1). */
 static double *
 lbfgs_alloc_vec(size_t count, int line) {
  double *buf=(double*)calloc(count,sizeof(double));
  (void)line; /* only read when the message below is compiled in */
  if (buf==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,line);
 #endif
     exit(1);
  }
  return buf;
 }
 /* Limited-memory BFGS minimisation driven by a finite-difference gradient
    (func_grad) and the Fletcher line search (linesearch).
   func: vector function
   p: parameter vector size m x 1; initial values on entry, overwritten
      with the final iterate on return
   x: size n x 1, handed to func_grad and to linesearch (as its xo argument)
   m: size of parameter vector
   n: size of vector function
   itmax: maximum number of iterations
   M: number of (s,y) pairs kept (history storage is m*M doubles each)
   gpu_threads: not referenced in this routine
   adata: additional data passed to the function
   Always returns 0. Exits the process if a work array cannot be allocated.
 */
 int
 lbfgs_fit(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {
  double step=1e-6; /* step for interpolation */
  /* work arrays, allocated in the same order as they are released below */
  double *grad=lbfgs_alloc_vec((size_t)m,__LINE__);     /* gradient */
  double *x_next=lbfgs_alloc_vec((size_t)m,__LINE__);   /* parameters at k+1 */
  double *x_cur=lbfgs_alloc_vec((size_t)m,__LINE__);    /* parameters at k */
  double *dir=lbfgs_alloc_vec((size_t)m,__LINE__);      /* step direction */
  double *s_hist=lbfgs_alloc_vec((size_t)m*M,__LINE__); /* delta(p), size mM x 1 */
  double *y_hist=lbfgs_alloc_vec((size_t)m*M,__LINE__); /* delta(grad), size mM x 1 */
  double *inv_ys=lbfgs_alloc_vec((size_t)M,__LINE__);   /* 1/yk^T*sk */
  double grad_norm;
  double alphak=1.0;
  int iter;      /* iteration counter */
  int slot=0;    /* index of the history pair being written */
  int offset=0;  /* slot*m, element offset into s_hist/y_hist */

  /* start from the caller's parameters */
  my_dcopy(m,p,1,x_cur,1);
  /* initial gradient */
  func_grad(func,x_cur,grad,x,m,n,step,adata);
  grad_norm=my_dnrm2(m,grad);
  if (grad_norm<CLM_STOP_THRESH) {
    /* gradient already too small: skip the iterations entirely */
    iter=itmax;
    step=0.0;
  } else {
    iter=0;
    /* clamp 1e-3/||grad|| to the range [1e-9,1e-6] */
    step=MAX(1e-9,MIN(1e-3/grad_norm,1e-6));
  }
 #ifdef DEBUG
  printf("||grad||=%g step=%g\n",grad_norm,step);
 #endif

  for (; iter<itmax; iter++) {
    double *sk=&s_hist[offset];
    double *yk=&y_hist[offset];

    /* direction dir=-H_k*grad, using at most M stored pairs */
    mult_hessian(m,dir,grad,s_hist,y_hist,inv_ys,(iter<M)?iter:M,slot);
    my_dscal(m,-1.0,dir);

    /* step length; parameters alpha1=10.0,sigma=0.1, rho=0.01, t1=9, t2=0.1, t3=0.5 */
    alphak=linesearch(func,x_cur,dir,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata);
    /* a negligible step means no further progress: stop */
    if (fabs(alphak)<CLM_EPSILON) {
      break;
    }

    /* x_next=x_cur+alphak*dir */
    my_dcopy(m,x_cur,1,x_next,1);
    my_daxpy(m,dir,alphak,x_next);

    /* sk=x_next-x_cur */
    my_dcopy(m,x_next,1,sk,1);
    my_daxpy(m,x_cur,-1.0,sk);

    /* yk=grad(x_next)-grad(x_cur): store -grad, refresh grad, then add it */
    my_dcopy(m,grad,1,yk,1);
    my_dscal(m,-1.0,yk);
    func_grad(func,x_next,grad,x,m,n,step,adata);
    my_daxpy(m,grad,1.0,yk);

    /* 1/yk^T*sk */
    inv_ys[slot]=1.0/my_ddot(m,yk,sk);

    /* accept the new iterate */
    my_dcopy(m,x_next,1,x_cur,1);

    /* advance to the next history slot, wrapping back to the first */
    if (offset<(M-1)*m) {
      offset+=m;
      slot++;
    } else {
      offset=0;
      slot=0;
    }
  }

  /* copy back solution to p */
  my_dcopy(m,x_cur,1,p,1);

  free(grad);
  free(x_next);
  free(x_cur);
  free(dir);
  free(s_hist);
  free(y_hist);
  free(inv_ys);
  return 0;
 }
--- a/src/lib/lmfit.c
+++ b/src/lib/lmfit.c
--- a/src/lib/lmfit_nocuda.c
+++ b/src/lib/lmfit_nocuda.c
--- a/Show More
+++ b/Show More