update to 0.3.8

2015-08-19 19:20:08 +02:00 · 2015-08-19 19:20:08 +02:00 · 4f971a5cdd
parent b167c81854
commit 4f971a5cdd
27 changed files with 2446 additions and 387 deletions
--- a/src/MPI/Makefile
+++ b/src/MPI/Makefile
@ -13,6 +13,7 @@ LDFLAGS=-Wl,--rpath,/usr/local/OpenBLAS/lib/
 #LDFLAGS=-Wl,-t,--rpath,/software/users/lofareor/SW/lib64
 # -Wl,--hash-style=both

+# with multithread FFTW
 MY_LIBS=-lm -lsagecal
 INCLUDES=-I. -I./lib -I$(CASA_INCDIR) -I/usr/include
 LIBPATH=-L$(LAPACK_DIR) -L$(CASA_LIBDIR)  -L./lib
--- a/src/MPI/main.cpp
+++ b/src/MPI/main.cpp
@ -33,7 +33,7 @@ using namespace Data;

 void
 print_copyright(void) {
-  cout<<"SAGECal-MPI 0.3.5 (C) 2011-2015 Sarod Yatawatta"<<endl;
+  cout<<"SAGECal-MPI 0.3.9 (C) 2011-2015 Sarod Yatawatta"<<endl;
 }


@ -44,10 +44,10 @@ print_help(void) {
   cout << "-f MSlist: text file with MS names" << endl;
   cout << "-s sky.txt: sky model file"<< endl;
   cout << "-c cluster.txt: cluster file"<< endl;
-   cout << "-p solutions.txt: if given, save solution in this file, if not given 'XXX.MS.solutions' will be used"<< endl;
+   cout << "-p solutions.txt: if given, save (global) solutions in this file, but slaves will always write to 'XXX.MS.solutions'"<< endl;
   cout << "-F sky model format: 0: LSM, 1: LSM with 3 order spectra : default "<< Data::format<<endl;
-   cout << "-I input column (DATA/CORRECTED_DATA) : default " <<Data::DataField<< endl;
-   cout << "-O ouput column (DATA/CORRECTED_DATA) : default " <<Data::OutField<< endl;
+   cout << "-I input column (DATA/CORRECTED_DATA/...) : default " <<Data::DataField<< endl;
+   cout << "-O ouput column (DATA/CORRECTED_DATA/...) : default " <<Data::OutField<< endl;
   cout << "-e max EM iterations : default " <<Data::max_emiter<< endl;
   cout << "-g max iterations  (within single EM) : default " <<Data::max_iter<< endl;
   cout << "-l max LBFGS iterations : default " <<Data::max_lbfgs<< endl;
@ -57,16 +57,16 @@ print_help(void) {
   cout << "-A ADMM iterations: default " <<Data::Nadmm<< endl;
   cout << "-P consensus polynomial order: default " <<Data::Npoly<< endl;
   cout << "-r regularization factor: default " <<Data::admm_rho<< endl;
+   cout << "-G regularization factor of each cluster (text file instead of -r): default : None" << endl;
   cout << "-x exclude baselines length (lambda) lower than this in calibration : default "<<Data::min_uvcut << endl;
   cout << "-y exclude baselines length (lambda) higher than this in calibration : default "<<Data::max_uvcut << endl;
   cout <<endl<<"Advanced options:"<<endl;
   cout << "-k cluster_id : correct residuals with solution of this cluster : default "<<Data::ccid<< endl;
   cout << "-o robust rho, robust matrix inversion during correction: default "<<Data::rho<< endl;
-   cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: default "<<Data::solver_mode<< endl;
+   cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: 6: NSD, default "<<Data::solver_mode<< endl;
   cout << "-L robust nu, lower bound: default "<<Data::nulow<< endl;
   cout << "-H robust nu, upper bound: default "<<Data::nuhigh<< endl;
   cout << "-R randomize iterations: default "<<Data::randomize<< endl;
-   cout << "-D 0,1,2 : if >0, enable diagnostics (Jacobian Leverage) 1 replace Jacobian Leverage as output, 2 only fractional noise/leverage is printed: default " <<Data::DoDiag<< endl;
   cout << "-T stop after this number of solutions (0 means no limit): default "<<Data::Nmaxtime<< endl;
   cout <<"Report bugs to <sarod@users.sf.net>"<<endl;
 }
@ -75,7 +75,7 @@ print_help(void) {
 void 
 ParseCmdLine(int ac, char **av) {
    char c;
-    while((c=getopt(ac, av, "c:e:f:g:j:k:l:m:n:o:p:r:s:t:x:y:A:D:F:I:L:O:P:H:R:T:h"))!= -1)
+    while((c=getopt(ac, av, "c:e:f:g:j:k:l:m:n:o:p:r:s:t:x:y:A:F:I:L:O:P:G:H:R:T:h"))!= -1)
    {
        switch(c)
        {
@ -147,6 +147,9 @@ ParseCmdLine(int ac, char **av) {
            case 'r': 
                admm_rho= atof(optarg);
                break;
+            case 'G': 
+                admm_rho_file= optarg;
+                break;
            case 'R': 
                randomize= atoi(optarg);
                break;
--- a/src/MPI/sagecal_master.cpp
+++ b/src/MPI/sagecal_master.cpp
@ -67,40 +67,42 @@ sagecal_master(int argc, char **argv) {

   iodata.Nms=ntasks-1;
   /**** get info from slaves ***************************************/
-   int *bufint=new int[4];
+   int *bufint=new int[5];
   double *bufdouble=new double[1];
   iodata.freqs=new double[iodata.Nms];
   iodata.freq0=0.0;
   iodata.N=iodata.M=iodata.totalt=0;
+   int Mo=0;
   /* use iodata to store the results, also check for consistency of results */
   for (int cm=0; cm<iodata.Nms; cm++) {
-     MPI_Recv(bufint, 4, /* N,M,tilesz,totalt */
+     MPI_Recv(bufint, 5, /* N,Mo(actual clusters),M(with hybrid),tilesz,totalt */
       MPI_INT, cm+1, TAG_MSAUX, MPI_COMM_WORLD, &status);
-cout<<"Slave "<<cm+1<<" N="<<bufint[0]<<" M="<<bufint[1]<<" tilesz="<<bufint[2]<<" totaltime="<<bufint[3]<<endl;
+cout<<"Slave "<<cm+1<<" N="<<bufint[0]<<" M="<<bufint[1]<<"/"<<bufint[2]<<" tilesz="<<bufint[3]<<" totaltime="<<bufint[4]<<endl;
     if (cm==0) { /* update data */
      iodata.N=bufint[0];
-      iodata.M=bufint[1];
-      iodata.tilesz=bufint[2];
-      iodata.totalt=bufint[3];
+      Mo=bufint[1];
+      iodata.M=bufint[2];
+      iodata.tilesz=bufint[3];
+      iodata.totalt=bufint[4];
     } else { /* compare against others */
-       if ((iodata.N != bufint[0]) || (iodata.M != bufint[1]) || (iodata.tilesz != bufint[2])) {
-        cout<<"Slave "<<cm+1<<" parameters do not match  N="<<bufint[0]<<" M="<<bufint[1]<<" tilesz="<<bufint[2]<<endl;
+       if ((iodata.N != bufint[0]) || (iodata.M != bufint[2]) || (iodata.tilesz != bufint[3])) {
+        cout<<"Slave "<<cm+1<<" parameters do not match  N="<<bufint[0]<<" M="<<bufint[2]<<" tilesz="<<bufint[3]<<endl;
       }
-       if (iodata.totalt<bufint[3]) {
+       if (iodata.totalt<bufint[4]) {
        /* use max value as total time */
-        iodata.totalt=bufint[3];
+        iodata.totalt=bufint[4];
       }
     }
     MPI_Recv(bufdouble, 1, /* freq */
       MPI_DOUBLE, cm+1, TAG_MSAUX, MPI_COMM_WORLD, &status);
     iodata.freqs[cm]=bufdouble[0];
     iodata.freq0 +=bufdouble[0];
-     cout<<"Slave "<<cm+1<<" freq="<<bufdouble[0]<<endl;
+     cout<<"Slave "<<cm+1<<" frequency (MHz)="<<bufdouble[0]*1e-6<<endl;
   }
    iodata.freq0/=(double)iodata.Nms;
    delete [] bufint;
    delete [] bufdouble;
-cout<<"Reference freq="<<iodata.freq0<<endl;
+cout<<"Reference frequency (MHz)="<<iodata.freq0*1.0e-6<<endl;
    /* ADMM memory */
    double *Z,*Y,*z;
    /* Z: 2Nx2 x Npoly x M */
@ -126,6 +128,24 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
    }
+    /* file for saving solutions */
+    FILE *sfp=0;
+    if (solfile) {
+     if ((sfp=fopen(solfile,"w+"))==0) {
+       fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
+       exit(1);
+     }
+    }
+
+    /* write additional info to solution file */
+    if (solfile) {
+      fprintf(sfp,"# solution file (Z) created by SAGECal\n");
+      fprintf(sfp,"# reference_freq(MHz) polynomial_order stations clusters effective_clusters\n");
+      fprintf(sfp,"%lf %d %d %d %d\n",iodata.freq0*1e-6,Npoly,iodata.N,Mo,iodata.M);
+    }
+
+
+

    /* interpolation polynomial */
    double *B,*Bi;
@ -139,12 +159,38 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
    }
-    /* regularization factor array */
-    double *arho;
-    if ((arho=(double*)calloc((size_t)Nadmm,sizeof(double)))==0) {
+    /* regularization factor array, size Mx1
+       one per each hybrid cluster */
+    double *arho,*arhoslave;
+    if ((arho=(double*)calloc((size_t)iodata.M,sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
    }
+    if ((arhoslave=(double*)calloc((size_t)Mo,sizeof(double)))==0) {
+     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+     exit(1);
+    }
+
+    /* if text file is given, read it and update rho array */
+    if (Data::admm_rho_file) {
+     read_arho_fromfile(Data::admm_rho_file,iodata.M,arho,Mo,arhoslave);
+    } else {
+     /* copy common value */
+     /* setup regularization factor array */
+     for (int p=0; p<iodata.M; p++) {
+      arho[p]=admm_rho; 
+     }
+     for (int p=0; p<Mo; p++) {
+      arhoslave[p]=admm_rho; 
+     }
+    }
+
+    /* send array to slaves */
+    /* update rho on each slave */
+    for(int cm=0; cm<iodata.Nms; cm++) {
+      MPI_Send(arhoslave, Mo, MPI_DOUBLE, cm+1,TAG_RHO, MPI_COMM_WORLD);
+    }
+    free(arhoslave);

 #ifdef DEBUG
    FILE *dfp;
@ -159,11 +205,6 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
    /* find sum B(:,i)B(:,i)^T, and its pseudoinverse */
    find_prod_inverse(B,Bi,Npoly,iodata.Nms);

-    /* setup regularization factor array */
-    for (int p=0; p<Nadmm; p++) {
-     arho[p]=admm_rho; 
-    }
-
 #ifdef DEBUG
    fprintf(dfp,"B=[\n");
    for (int p=0; p<Npoly; p++) {
@ -174,7 +215,7 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
    }
    fprintf(dfp,"];\n");
    fprintf(dfp,"rho=%lf;\narho=[",admm_rho);
-    for (int p=0; p<Nadmm; p++) {
+    for (int p=0; p<iodata.M; p++) {
      fprintf(dfp,"%lf ",arho[p]);
    }
    fprintf(dfp,"];\n");
@ -197,8 +238,13 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
 #ifdef DEBUG
    Ntime=1;
 #endif
-cout<<"Master total timeslots="<<Ntime<<endl;
-cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="<<admm_rho<<endl;
+    cout<<"Master total timeslots="<<Ntime<<endl;
+
+    if (!Data::admm_rho_file) {
+     cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="<<admm_rho<<endl;
+    } else {
+     cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization given by text file "<<Data::admm_rho_file<<endl;
+    }
    int msgcode;
    for (int ct=0; ct<Ntime; ct++)  {
      /* send start processing signal to slaves */
@ -208,11 +254,6 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
      }

      for (int admm=0; admm<Nadmm; admm++) {
-         /* update rho on each slave */
-         for(int cm=0; cm<iodata.Nms; cm++) {
-          MPI_Send(&arho[admm], 1, MPI_DOUBLE, cm+1,TAG_RHO, MPI_COMM_WORLD);
-         }
-
         /* get Y_i+rho J_i from each slave */
         /* note: for first iteration, reorder values as
            2Nx2 complex  matrix blocks, M times from each  slave 
@ -270,8 +311,16 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
            my_daxpy(8*iodata.N*iodata.M, &Y[cm*8*iodata.N*iodata.M], B[cm*Npoly+ci], &z[ci*8*iodata.N*iodata.M]);
           }
         }
-         /* also scale by 1/rho */
-         my_dscal(8*iodata.N*iodata.M*Npoly,1.0/arho[admm],z);
+         /* also scale by 1/rho, only if rho>0, otherwise set it to 0.0*/
+         for (int cm=0; cm<iodata.M; cm++) {
+          double invscale=0.0;
+          if (arho[cm]>0.0) {
+           invscale=1.0/arho[cm];
+          } 
+          for (int ci=0; ci<Npoly; ci++) {
+            my_dscal(8*iodata.N,invscale,&z[8*iodata.N*iodata.M*ci+8*iodata.N*cm]);
+          }
+         }

 #ifdef DEBUG
         fprintf(dfp,"%%%%%%%%%%%%%% time=%d admm=%d\n",ct,admm);
@ -342,6 +391,19 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
          resetcount++;
        }
      }
+
+    /* write Z to solution file, same format as J, but we have Npoly times more
+       values per timeslot per column */
+     if (solfile) {
+      for (int p=0; p<iodata.N*8*Npoly; p++) {
+       fprintf(sfp,"%d ",p);
+       for (int pp=0; pp<iodata.M; pp++) {
+        fprintf(sfp," %e",Z[pp*iodata.N*8*Npoly+p]);
+       }
+       fprintf(sfp,"\n");
+      }
+     }
+
    }

    /* send end signal to each slave */
@ -350,6 +412,10 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
        MPI_Send(&msgcode, 1, MPI_INT, cm+1,TAG_CTRL, MPI_COMM_WORLD);
    }

+    if (solfile) {
+      fclose(sfp);
+    }
+

 #ifdef DEBUG
    fclose(dfp);
--- a/src/MPI/sagecal_slave.cpp
+++ b/src/MPI/sagecal_slave.cpp
@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <pthread.h>
+#include <time.h>

 #include<sagecal.h>
 #include <mpi.h>
@ -71,21 +72,14 @@ sagecal_slave(int argc, char **argv) {
    double **pm;
    complex double *coh;
    FILE *sfp=0;
-    if (solfile) {
-      if ((sfp=fopen(solfile,"w+"))==0) {
+    /* always create default solution file name MS+'.solutions' */
+    std::string filebuff=std::string(Data::TableName)+std::string(".solutions\0");
+    if ((sfp=fopen(filebuff.c_str(),"w+"))==0) {
       fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
       return 1;
-      }
-    } else {
-     /* create default solution file name MS+'.solutions' */
-     std::string filebuff=std::string(Data::TableName)+std::string(".solutions\0");
-     if ((sfp=fopen(filebuff.c_str(),"w+"))==0) {
-       fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
-       return 1;
-     }
-     /* set solfile to non null value */
-     solfile=const_cast<char*>(filebuff.c_str());
    }
+    /* set solfile to non null value */
+    solfile=const_cast<char*>(filebuff.c_str());


     double mean_nu;
@ -192,17 +186,25 @@ sagecal_slave(int argc, char **argv) {
      msitr[cm]->origin();
    }

+    /* write additional info to solution file */
+    if (solfile) {
+      fprintf(sfp,"# solution file created by SAGECal\n");
+      fprintf(sfp,"# freq(MHz) bandwidth(MHz) time_interval(min) stations clusters effective_clusters\n");
+      fprintf(sfp,"%lf %lf %lf %d %d %d\n",iodata.freq0*1e-6,iodata.deltaf*1e-6,(double)iodata.tilesz*iodata.deltat/60.0,iodata.N,M,Mt);
+    }
+

    /**** send info to master ***************************************/
-    /* send freq (freq0), no. stations (N), total timeslots (totalt), no. of clusters (Mt), integration time (deltat), bandwidth (deltaf) */
-    int *bufint=new int[4];
+    /* send freq (freq0), no. stations (N), total timeslots (totalt), no. of clusters (M), true no. of clusters with hybrid (Mt), integration time (deltat), bandwidth (deltaf) */
+    int *bufint=new int[5];
    double *bufdouble=new double[1];
    bufint[0]=iodata.N;
-    bufint[1]=Mt;
-    bufint[2]=iodata.tilesz;
-    bufint[3]=iodata.totalt;
+    bufint[1]=M;
+    bufint[2]=Mt;
+    bufint[3]=iodata.tilesz;
+    bufint[4]=iodata.totalt;
    bufdouble[0]=iodata.freq0;
-    MPI_Send(bufint, 4, MPI_INT, 0,TAG_MSAUX, MPI_COMM_WORLD);
+    MPI_Send(bufint, 5, MPI_INT, 0,TAG_MSAUX, MPI_COMM_WORLD);
    MPI_Send(bufdouble, 1, MPI_DOUBLE, 0,TAG_MSAUX, MPI_COMM_WORLD);

    delete [] bufint;
@ -227,6 +229,15 @@ sagecal_slave(int argc, char **argv) {
     exit(1);
    }

+    double *arho;
+    if ((arho=(double*)calloc((size_t)M,sizeof(double)))==0) {
+     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+     exit(1);
+    }
+
+    /* get regularization factor array */
+    MPI_Recv(arho,M,MPI_DOUBLE,0,TAG_RHO,MPI_COMM_WORLD,&status);
+
    /* if we have more than 1 channel, need to backup raw data */
    double *xbackup=0;
    if (iodata.Nchan>1) {
@ -236,6 +247,7 @@ sagecal_slave(int argc, char **argv) {
     }
    }

+
    int msgcode=0;
    /* starting iterations doubled */
    int start_iter=1;
@ -265,22 +277,19 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
     /******************** ADMM  *******************************/

     for (int admm=0; admm<Nadmm; admm++) {
-      /* get current regularization factor */
-      MPI_Recv(&admm_rho,1,MPI_DOUBLE,0,TAG_RHO,MPI_COMM_WORLD,&status);
-
      /* ADMM 1: minimize cost function */
      if (admm==0) { 
 #ifndef HAVE_CUDA
      if (start_iter) {
-       sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
+       sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,0,&mean_nu,&res_0,&res_1); /* 0 for dummy whiten flag */
       start_iter=0;
      } else {
-       sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
+       sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,0,&mean_nu,&res_0,&res_1);
      }
 #endif /* !HAVE_CUDA */
 #ifdef HAVE_CUDA
      if (start_iter) {
-       sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
+       sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
       start_iter=0;
      } else {
       sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
@ -301,11 +310,11 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
       }
 
 #ifndef HAVE_CUDA
-       sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
+       sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
 #endif /* !HAVE_CUDA */
 #ifdef HAVE_CUDA
-       //sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
-       sagefit_visibilities_admm_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
+       //sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
+       sagefit_visibilities_admm_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
 #endif /* HAVE_CUDA */
      }

@ -314,10 +323,25 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
      if (admm==0) {
       /* Y is set to 0 : so original is just rho * J*/
       my_dcopy(iodata.N*8*Mt, p, 1, Y, 1);
-       my_dscal(iodata.N*8*Mt, admm_rho, Y);
+       /* scale by individual rho for each cluster */
+       /* if rho<=0, do nothing */
+       ck=0;
+       for (ci=0; ci<M; ci++) {
+        /* Y will be set to 0 if rho<=0 */
+        my_dscal(iodata.N*8*carr[ci].nchunk, arho[ci], &Y[ck]);
+        ck+=iodata.N*8*carr[ci].nchunk;
+       }
      } else {
-       my_daxpy(iodata.N*8*Mt, p, admm_rho, Y);
+       ck=0;
+       for (ci=0; ci<M; ci++) {
+        if (arho[ci]>0.0) {
+         my_daxpy(iodata.N*8*carr[ci].nchunk, &p[ck], arho[ci], &Y[ck]);
+        }
+        ck+=iodata.N*8*carr[ci].nchunk;
+//cout<<"Clus="<<ci<<" Chunk="<<carr[ci].nchunk<<" Rho="<<arho[ci]<<endl;
+       }
      }
+
      MPI_Send(Y, iodata.N*8*Mt, MPI_DOUBLE, 0,TAG_YDATA, MPI_COMM_WORLD);
      /* for initial ADMM iteration, get back Y with common unitary ambiguity */
      if (admm==0) {
@ -329,7 +353,14 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
     
      /* update Y_i <= Y_i + rho (J_i-B_i Z)
          since we already have Y_i + rho J_i, only need -rho (B_i Z) */
-      my_daxpy(iodata.N*8*Mt, Z, -admm_rho, Y);
+      ck=0;
+      for (ci=0; ci<M; ci++) {
+        if (arho[ci]>0.0) {
+         my_daxpy(iodata.N*8*carr[ci].nchunk, &Z[ck], -arho[ci], &Y[ck]);
+        }
+        ck+=iodata.N*8*carr[ci].nchunk;
+      }
+
      /* calculate primal residual J-BZ */
      my_dcopy(iodata.N*8*Mt, p, 1, pres, 1);
      my_daxpy(iodata.N*8*Mt, Z, -1.0, pres);
@ -365,21 +396,21 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
    /* if residual has increased too much, or all are flagged (0 residual)
      or NaN
      reset solutions to original
-      initial values */
-    if (res_1==0.0 || !isfinite(res_1) || res_1>res_ratio*res_prev) {
+      initial values : use residual at 1st ADMM */
+    if (res_01==0.0 || !isfinite(res_01) || res_01>res_ratio*res_prev) {
      cout<<"Resetting Solution"<<endl;
      /* reset solutions so next iteration has default initial values */
      memcpy(p,pinit,(size_t)iodata.N*8*Mt*sizeof(double));
      /* also assume iterations have restarted from scratch */
      start_iter=1;
      /* also forget min residual (otherwise will try to reset it always) */
-      res_prev=res_1;
-    } else if (res_1<res_prev) { /* only store the min value */
-     res_prev=res_1;
+      res_prev=res_01;
+    } else if (res_01<res_prev) { /* only store the min value */
+     res_prev=res_01;
    }
    end_time = time(0);
    elapsed_time = ((double) (end_time-start_time)) / 60.0;
-    if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS) { 
+    if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) { 
     cout<<"nu="<<mean_nu<<endl;
    }
    cout<<myrank<< ": Timeslot: "<<tilex<<" residual: initial="<<res_00<<"/"<<res_0<<",final="<<res_01<<"/"<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
@ -463,6 +494,7 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
  free(Z);
  free(Y);
  free(pres);
+  free(arho);
  /**********************************************************/

   cout<<"Done."<<endl;    
--- a/src/MS/Makefile
+++ b/src/MS/Makefile
@ -1,6 +1,6 @@
 OUTPUT=
 CXX=g++
-CXXFLAGS=-O3 -Wall -g #-fnostack-protector
+CXXFLAGS=-O3 -Wall -g #-pg #-fnostack-protector
 CASA_LIBDIR=/opt/casacore/lib
 CASA_INCDIR=/opt/casacore/include/casacore
 CASA_LIBS=-lcasa_casa -lcasa_tables -lcasa_measures -lcasa_ms -lcfitsio
--- a/src/MS/data.cpp
+++ b/src/MS/data.cpp
@ -55,6 +55,7 @@ int Data::lbfgs_m=7;
 int Data::gpu_threads=128;
 int Data::linsolv=1;
 int Data::randomize=1;
+int Data::whiten=0;
 int Data::DoSim=0;
 int Data::DoDiag=0;
 int Data::doChan=0; /* if 1, solve for each channel in multi channel data */
@ -69,6 +70,7 @@ char *Data::MSlist=NULL;
 int Data::Nadmm=1;
 int Data::Npoly=2;
 double Data::admm_rho=5.0;
+char *Data::admm_rho_file=NULL;

 /* no upper limit, solve for all timeslots */
 int Data::Nmaxtime=0;
@ -298,6 +300,7 @@ Data::loadData(Table ti, Data::IOData iodata) {
        for(int k = 0; k < iodata.Nchan; k++) {
           Complex *ptr = data[k].data();
           bool *flgptr=flag[k].data();
+           //if (!flag.data()[k]){
           if (!flgptr[0] && !flgptr[1] && !flgptr[2] && !flgptr[3]){
             cxx+=ptr[0];
             cxy+=ptr[1];
@ -438,6 +441,7 @@ Data::loadDataList(vector<MSIter*> msitr, Data::IOData iodata) {
        for(int k = 0; k < iodata.NchanMS[cm]; k++) {
           Complex *ptr = data[k].data();
           bool *flgptr=flag[k].data();
+           //if (!flag.data()[k]){
           if (!flgptr[0] && !flgptr[1] && !flgptr[2] && !flgptr[3]){
             cxx+=ptr[0];
             cxy+=ptr[1];
--- a/src/MS/data.h
+++ b/src/MS/data.h
@ -114,6 +114,7 @@ namespace Data
    extern char *ignorefile;
    extern double nulow,nuhigh;
    extern int randomize;
+    extern int whiten;
    extern int DoSim; /* if 1, simulation mode */
    extern int doChan; /* if 1, solve for each channel in multi channel data */
    extern int DoDiag; /* if >0, enables diagnostics (Leverage) 1: write leverage as output (no residual), 2: only calculate fractions of leverage/noise */
@ -122,6 +123,7 @@ namespace Data
    extern int Nadmm; /* ADMM iterations >=1 */
    extern int Npoly; /* polynomial order >=1 */
    extern double admm_rho; /* regularization */
+    extern char *admm_rho_file; /* text file for regularization of each cluster */
    /* for debugging, upper limit on time slots */
    extern int Nmaxtime;
 }
--- a/src/MS/main.cpp
+++ b/src/MS/main.cpp
@ -31,7 +31,7 @@ using namespace Data;

 void
 print_copyright(void) {
-  cout<<"SAGECal 0.3.5 (C) 2011-2015 Sarod Yatawatta"<<endl;
+  cout<<"SAGECal 0.3.8 (C) 2011-2015 Sarod Yatawatta"<<endl;
 }


@ -47,15 +47,15 @@ print_help(void) {
   cout << "-c cluster.txt: cluster file"<< endl;
   cout << "-p solutions.txt: if given, save solution in this file"<< endl;
   cout << "-F sky model format: 0: LSM, 1: LSM with 3 order spectra : default "<< Data::format<<endl;
-   cout << "-I input column (DATA/CORRECTED_DATA) : default " <<Data::DataField<< endl;
-   cout << "-O ouput column (DATA/CORRECTED_DATA) : default " <<Data::OutField<< endl;
+   cout << "-I input column (DATA/CORRECTED_DATA/...) : default " <<Data::DataField<< endl;
+   cout << "-O ouput column (DATA/CORRECTED_DATA/...) : default " <<Data::OutField<< endl;
   cout << "-e max EM iterations : default " <<Data::max_emiter<< endl;
   cout << "-g max iterations  (within single EM) : default " <<Data::max_iter<< endl;
   cout << "-l max LBFGS iterations : default " <<Data::max_lbfgs<< endl;
   cout << "-m LBFGS memory size : default " <<Data::lbfgs_m<< endl;
   cout << "-n no of worker threads : default "<<Data::Nt << endl;
   cout << "-t tile size : default " <<Data::TileSize<< endl;
-   cout << "-a 0,1 : if 1, only simulate (with solutions if solutions file is also given): default " <<Data::DoSim<< endl;
+   cout << "-a 0,1,2 : if 1, only simulate, if 2, simulate and add to residual, (multiplied by solutions if solutions file is also given): default " <<Data::DoSim<< endl;
   cout << "-z ignore_clusters: if only doing a simulation, ignore the cluster ids listed in this file" << endl;
   cout << "-b 0,1 : if 1, solve for each channel: default " <<Data::doChan<< endl;
   cout << "-x exclude baselines length (lambda) lower than this in calibration : default "<<Data::min_uvcut << endl;
@ -63,9 +63,10 @@ print_help(void) {
   cout <<endl<<"Advanced options:"<<endl;
   cout << "-k cluster_id : correct residuals with solution of this cluster : default "<<Data::ccid<< endl;
   cout << "-o robust rho, robust matrix inversion during correction: default "<<Data::rho<< endl;
-   cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: default "<<Data::solver_mode<< endl;
+   cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR, 6:NSD : default "<<Data::solver_mode<< endl;
   cout << "-L robust nu, lower bound: default "<<Data::nulow<< endl;
   cout << "-H robust nu, upper bound: default "<<Data::nuhigh<< endl;
+   cout << "-W pre-whiten data: default "<<Data::whiten<< endl;
   cout << "-R randomize iterations: default "<<Data::randomize<< endl;
   cout << "-D 0,1,2 : if >0, enable diagnostics (Jacobian Leverage) 1 replace Jacobian Leverage as output, 2 only fractional noise/leverage is printed: default " <<Data::DoDiag<< endl;
   cout <<"Report bugs to <sarod@users.sf.net>"<<endl;
@ -81,7 +82,7 @@ ParseCmdLine(int ac, char **av) {
        print_help();
        exit(0);
    }
-    while((c=getopt(ac, av, "a:b:c:d:e:f:g:j:k:l:m:n:o:p:s:t:x:y:z:D:F:I:O:L:H:R:h"))!= -1)
+    while((c=getopt(ac, av, "a:b:c:d:e:f:g:j:k:l:m:n:o:p:s:t:x:y:z:D:F:I:O:L:H:R:W:h"))!= -1)
    {
        switch(c)
        {
@ -105,7 +106,7 @@ ParseCmdLine(int ac, char **av) {
                break;
            case 'a':
                DoSim= atoi(optarg);
-                if (DoSim>1) { DoSim=1; }
+                if (DoSim<0) { DoSim=1; }
                break;
            case 'b':
                doChan= atoi(optarg);
@ -154,6 +155,9 @@ ParseCmdLine(int ac, char **av) {
            case 'R': 
                randomize= atoi(optarg);
                break;
+            case 'W': 
+                whiten= atoi(optarg);
+                break;
            case 'x': 
                Data::min_uvcut= atof(optarg);
                break;
@ -187,13 +191,13 @@ ParseCmdLine(int ac, char **av) {
    cout<<"Selecting baselines > "<<min_uvcut<<" and < "<<max_uvcut<<" wavelengths."<<endl;
    if (!DoSim) {
    cout<<"Using ";
-    if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS) {
+    if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS ||  solver_mode==SM_NSD_RLBFGS) {
     cout<<"Gaussian noise model for solver."<<endl;
    } else {
     cout<<"Robust noise model for solver with degrees of freedom ["<<nulow<<","<<nuhigh<<"]."<<endl;
    }
    } else {
-     cout<<"Only doing simulation."<<endl;
+     cout<<"Only doing simulation (with possible correction for cluster id "<<ccid<<")."<<endl;
    }
 }

@ -226,11 +230,11 @@ main(int argc, char **argv) {
    }
    /**********************************************************/
     int M,Mt,ci,cj,ck;  
-  /* parameters */
-  double *p,*pinit,*pfreq;
-  double **pm;
-  complex double *coh;
-  FILE *sfp=0;
+   /* parameters */
+   double *p,*pinit,*pfreq;
+   double **pm;
+   complex double *coh;
+   FILE *sfp=0;
    if (solfile) {
     if (!Data::DoSim) {
      if ((sfp=fopen(solfile,"w+"))==0) {
@ -240,9 +244,16 @@ main(int argc, char **argv) {
     } else {
     /* simulation mode, read only access */
      if ((sfp=fopen(solfile,"r"))==0) {
-       fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
+       fprintf(stderr,"%s: %d: no solution file present\n",__FILE__,__LINE__);
       return 1;
      }
+      /* skip the 3 header lines of the solution file; also stop at EOF so a short or truncated file cannot hang this loop */
+      int chr; /* int, not char: fgetc() reports end-of-file/error with the out-of-band value EOF */
+      for (ci=0; ci<3; ci++) {
+       do {
+        chr = fgetc(sfp);
+       } while (chr != '\n' && chr != EOF);
+      } 
     }
    }

@ -390,7 +401,15 @@ main(int argc, char **argv) {
    for(int cm=0; cm<iodata.Nms; cm++) {
      msitr[cm]->origin();
    }
-    /* starting iterations doubled */
+
+    /* write additional info to solution file */
+    if (solfile && !Data::DoSim) {
+      fprintf(sfp,"# solution file created by SAGECal\n");
+      fprintf(sfp,"# freq(MHz) bandwidth(MHz) time_interval(min) stations clusters effective_clusters\n");
+      fprintf(sfp,"%lf %lf %lf %d %d %d\n",iodata.freq0*1e-6,iodata.deltaf*1e-6,(double)iodata.tilesz*iodata.deltat/60.0,iodata.N,M,Mt);
+    }
+
+    /* starting iterations are doubled */
    int start_iter=1;
    while (msitr[0]->more()) {
      start_time = time(0);
@ -456,7 +475,8 @@ main(int argc, char **argv) {
     inout(p: length(8*mic_N*Mt)) 
     sagefit_visibilities_mic(mic_u,mic_v,mic_w,mic_x,mic_N,mic_Nbase,mic_tilesz,barr,mic_chunks,mic_pindex,coh,M,Mt,mic_freq0,mic_deltaf,p,mic_data_min_uvcut,mic_data_Nt,2*mic_data_max_emiter,mic_data_max_iter,(mic_data_dochan? 0 :mic_data_max_lbfgs),mic_data_lbfgs_m,mic_data_gpu_threads,mic_data_linsolv,mic_data_solver_mode,mic_data_nulow,mic_data_nuhigh,mic_data_randomize,&mean_nu,&res_0,&res_1);
 #else /* NOT MIC */
-     sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
+     sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
+     //sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
 #endif /* USE_MIC */
     start_iter=0;
    } else {
@ -474,14 +494,14 @@ main(int argc, char **argv) {
     inout(p: length(8*mic_N*Mt)) 
     sagefit_visibilities_mic(mic_u,mic_v,mic_w,mic_x,mic_N,mic_Nbase,mic_tilesz,barr,mic_chunks,mic_pindex,coh,M,Mt,mic_freq0,mic_deltaf,p,mic_data_min_uvcut,mic_data_Nt,mic_data_max_emiter,mic_data_max_iter,(mic_data_dochan? 0: mic_data_max_lbfgs),mic_data_lbfgs_m,mic_data_gpu_threads,mic_data_linsolv,mic_data_solver_mode,mic_data_nulow,mic_data_nuhigh,mic_data_randomize,&mean_nu,&res_0,&res_1);
 #else /* NOT MIC */
-     sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
+     sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
 #endif /* USE_MIC */
    }
 #endif /* !HAVE_CUDA */
 #ifdef HAVE_CUDA
 #ifdef ONE_GPU
    if (start_iter) {
-     sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
+     sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt, (iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
     start_iter=0;
    } else {
     sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
@ -489,7 +509,7 @@ main(int argc, char **argv) {
 #endif /* ONE_GPU */
 #ifndef ONE_GPU
    if (start_iter) {
-     sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
+     sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
     ///DBG sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
     start_iter=0;
    } else {
@ -561,11 +581,11 @@ main(int argc, char **argv) {
   } else {
    /************ simulation only mode ***************************/
    if (!solfile) {
-     predict_visibilities_multifreq(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt);
+     predict_visibilities_multifreq(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,(Data::DoSim>1?1:0));
    } else {
     read_solutions(sfp,p,carr,iodata.N,M);
    /* if solution file is given, read in the solutions and predict */
-     predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt);
+     predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,(Data::DoSim>1?1:0),Data::ccid,Data::rho);
    }
    /************ end simulation only mode ***************************/
   }
@ -613,10 +633,14 @@ main(int argc, char **argv) {
   }
    end_time = time(0);
    elapsed_time = ((double) (end_time-start_time)) / 60.0;
-    if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS) { 
+    if (!Data::DoSim) {
+    if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) { 
    cout<<"nu="<<mean_nu<<endl;
    }
-cout<<"Timeslot: "<<tilex<<" Residual: initial="<<res_0<<",final="<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
+      cout<<"Timeslot: "<<tilex<<" Residual: initial="<<res_0<<",final="<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
+    } else {
+      cout<<"Timeslot: "<<tilex<<", Time spent="<<elapsed_time<<" minutes"<<endl;
+    }
    }

   Data::freeData(iodata);
--- a/src/lib/Makefile
+++ b/src/lib/Makefile
@ -1,6 +1,6 @@
 CC=gcc
 CXX=g++
-CFLAGS= -Wall -O3 -g
+CFLAGS= -Wall -O3 -g #-pg
 CLIBS= -lm -lpthread
 #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
 #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
@ -52,7 +52,6 @@ rtr_solve_robust_admm.o:rtr_solve_robust_admm.c
 admm_solve.o:admm_solve.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<

-
 RANLIB=ranlib
 libsagecal.a:$(OBJECTS) sagecal.h
 	ar rv $@ $(OBJECTS); \
--- a/src/lib/admm_solve.c
+++ b/src/lib/admm_solve.c
@ -387,7 +387,7 @@ minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {

 int
 sagefit_visibilities_admm(double *u, double *v, double *w, double *x, int N,   
-   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1) {
+   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1) {

  int  ci,cj,ck,tcj;
  double *p; // parameters: m x 1
@ -511,7 +511,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
        /* use a reasonable TR radius because cost function has extra 
       regularization NB: ADMM very sensitive to this */
       double Delta0=2.0; 
-       rtr_solve_nocuda_robust_admm(&p[carr[cj].p[ck]], &Y[carr[cj].p[ck]], &BZ[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+5, this_itermax+10, Delta0, Delta0*0.125, admm_rho, nulow, nuhigh, info, &lmdata);
+       rtr_solve_nocuda_robust_admm(&p[carr[cj].p[ck]], &Y[carr[cj].p[ck]], &BZ[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+5, this_itermax+10, Delta0, Delta0*0.125, admm_rho[cj], nulow, nuhigh, info, &lmdata);
       if (ci==max_emiter-1){
            robust_nuM[cj]+=lmdata.robust_nu;
       }
@ -563,6 +563,8 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
  free(robust_nuM);
  if (robust_nu0<nulow) {
     robust_nu0=nulow;
+  } else if (robust_nu0>nuhigh) {
+     robust_nu0=nuhigh;
  }

  /* final residual calculation */
@ -601,7 +603,7 @@ pipeline_slave_code_admm_flt(void *data)
  sync_barrier(&(td->pline->gate2)); /* stop at gate 2 */
  /* do work : only one solver */
  //printf("state=%d, thread %d\n",gd->status[tid],tid);
-  if (gd->status[tid]==PT_DO_WORK_RRTR ) {
+  if (gd->status[tid]==PT_DO_WORK_RRTR || gd->status[tid]==PT_DO_WORK_NSD) {
 /************************* work *********************/
  me_data_t *t=(me_data_t *)gd->lmdata[tid];
  /* divide the tiles into chunks tilesz/nchunk */
@ -626,10 +628,13 @@ pipeline_slave_code_admm_flt(void *data)
      ntiles=t->tilesz-cj;
     }

-      /* max trust region radius: keep reasonable */
-      float Delta0=2.0f;
-      /* storage (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M */
-      ret=rtr_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->admm_rho, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid],  cj, ntiles, (void*)gd->lmdata[tid]);
+      if (gd->status[tid]==PT_DO_WORK_NSD) {
+       ret=nsd_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+15, gd->admm_rho[tid], gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid],  cj, ntiles, (void*)gd->lmdata[tid]);
+      } else {
+       /* max trust region radius: keep reasonable */
+       float Delta0=2.0f;
+       ret=rtr_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->admm_rho[tid], gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid],  cj, ntiles, (void*)gd->lmdata[tid]);
+      }

     init_res+=gd->info[tid][0];
     final_res+=gd->info[tid][1];
@ -712,7 +717,7 @@ destroy_pipeline_admm_flt(th_pipeline *pline)
 //#define DEBUG
 int
 sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x, int N, 
-   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,  double nulow, double nuhigh, int randomize,double admm_rho, double *mean_nu, double *res_0, double *res_1) {
+   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,  double nulow, double nuhigh, int randomize,double *admm_rho, double *mean_nu, double *res_0, double *res_1) {


  int  ci,cj;
@ -857,23 +862,20 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
  tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
  /* also calculate the total storage needed to be allocated on a GPU */
   /* determine total size for memory allocation */
-   int Mm=8*N;
   int64_t data_sz=0;
-   /* size for RTR (float)  : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
-        where M: no of stations (params/8) ntiles: tile size Nbase: baselines  
+   /* size for RTR/NSD (float), 128 is the ThreadsPerBlock   
+      NSD is a bit lower
   */
-   int64_t data_sz_rtr=0;
-   /* 2 x Nbase more than normal RTR for weight, log(weight) */
-   data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
-   /* size for ROBUSTLM */
-   data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
-   data_sz=MAX(data_sz,data_sz_rtr);
-   data_sz+=(int64_t)Mm*sizeof(float);
+  if (solver_mode==SM_NSD_RLBFGS) {
+   data_sz=(8*N*(7+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
+  } else { /* default is RTR */
+   data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
+  }
+
  tpg.data_size=data_sz;
  tpg.nulow=nulow;
  tpg.nuhigh=nuhigh;
  tpg.randomize=randomize;
-  tpg.admm_rho=(float)admm_rho;
  sync_barrier(&(tp.gate2)); /* sync at gate 2*/

  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
@ -948,6 +950,7 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
     tpg.p[0]=&pf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
     tpg.Y[0]=&Yf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
     tpg.Z[0]=&Zf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
+     tpg.admm_rho[0]=(float)admm_rho[c0];
     tpg.x[0]=xdummy0f;
     tpg.M[0]=8*N; /* even though size of p is > M, dont change this */
     tpg.N[0]=n; /* Nbase*tilesz*8 */
@ -960,6 +963,7 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
     tpg.p[1]=&pf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
     tpg.Y[1]=&Yf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
     tpg.Z[1]=&Zf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
+     tpg.admm_rho[1]=(float)admm_rho[c1];
     tpg.x[1]=xdummy1f;
     tpg.M[1]=8*N; /* even though size of p is > M, dont change this */
     tpg.N[1]=n; /* Nbase*tilesz*8 */
@ -971,7 +975,11 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
 /**************************************************************************/

     /* both threads do work */
-     tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
+     if (solver_mode==SM_NSD_RLBFGS) {
+      tpg.status[0]=tpg.status[1]=PT_DO_WORK_NSD;
+     } else {
+      tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
+     }
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
     tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
@ -1037,6 +1045,7 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
     tpg.p[0]=&pf[carr[c0].p[0]];
     tpg.Y[0]=&Yf[carr[c0].p[0]];
     tpg.Z[0]=&Zf[carr[c0].p[0]];
+     tpg.admm_rho[0]=(float)admm_rho[c0];
     tpg.x[0]=xdummy0f;
     tpg.M[0]=8*N;
     tpg.N[0]=n;
@ -1046,7 +1055,11 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
     tpg.linsolv=linsolv;
     tpg.lmdata[0]=&lmdata0;

-     tpg.status[0]=PT_DO_WORK_RRTR;
+     if (solver_mode==SM_NSD_RLBFGS) {
+      tpg.status[0]=PT_DO_WORK_NSD;
+     } else {
+      tpg.status[0]=PT_DO_WORK_RRTR;
+     }

     tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
@ -1106,9 +1119,12 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
    printf("mean nu=%lf\n",robust_nu0);
 #endif
    free(robust_nuM);
-    if (robust_nu0<nulow) {
+  if (robust_nu0<nulow) {
     robust_nu0=nulow;
-    }
+  } else if (robust_nu0>nuhigh) {
+     robust_nu0=nuhigh;
+  }
+
  /******** free threads ***************/
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
  tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
@ -1139,7 +1155,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);

 int
 sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, double *x, int N, 
-   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,  double nulow, double nuhigh, int randomize,double admm_rho, double *mean_nu, double *res_0, double *res_1) {
+   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,  double nulow, double nuhigh, int randomize,double *admm_rho, double *mean_nu, double *res_0, double *res_1) {


  int  ci,cj;
@ -1284,24 +1300,16 @@ sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, doubl
  tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
  /* also calculate the total storage needed to be allocated on a GPU */
   /* determine total size for memory allocation */
-   int Mm=8*N;
   int64_t data_sz=0;
-   /* size for RTR (float)  : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
-        where M: no of stations (params/8) ntiles: tile size Nbase: baselines  
+   /* size for RTR/NSD (float), 128 is the ThreadsPerBlock   
+      NSD is a bit lower, but use the same
   */
-   int64_t data_sz_rtr=0;
-   /* 2 x Nbase more than normal RTR for weight, log(weight) */
-   data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
-   /* size for ROBUSTLM */
-   data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
-   data_sz=MAX(data_sz,data_sz_rtr);
-   data_sz+=(int64_t)Mm*sizeof(float);
+  data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);

  tpg.data_size=data_sz;
  tpg.nulow=nulow;
  tpg.nuhigh=nuhigh;
  tpg.randomize=randomize;
-  tpg.admm_rho=(float)admm_rho;
  sync_barrier(&(tp.gate2)); /* sync at gate 2*/

  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
@ -1358,6 +1366,7 @@ sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, doubl
     tpg.p[0]=&pf[carr[c0].p[0]];
     tpg.Y[0]=&Yf[carr[c0].p[0]];
     tpg.Z[0]=&Zf[carr[c0].p[0]];
+     tpg.admm_rho[0]=(float)admm_rho[c0];
     tpg.x[0]=xdummy0f;
     tpg.M[0]=8*N;
     tpg.N[0]=n;
@ -1427,9 +1436,13 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
    printf("mean nu=%lf\n",robust_nu0);
 #endif
    free(robust_nuM);
-    if (robust_nu0<nulow) {
+  if (robust_nu0<nulow) {
     robust_nu0=nulow;
-    }
+  } else if (robust_nu0>nuhigh) {
+     robust_nu0=nuhigh;
+  }
+
+
  /******** free threads ***************/
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
  tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
--- a/src/lib/clmfit.c
+++ b/src/lib/clmfit.c
@ -23,6 +23,7 @@
 #include <string.h>
 #include <math.h>
 #include <float.h>
+#include <unistd.h>

 #include "sagecal.h"
 #include <cuda_runtime.h>
@ -1175,8 +1176,14 @@ attach_gpu_to_thread2(int card,  cublasHandle_t *cbhandle,float **WORK, int64_t
  cudaError_t err;
  culaStatus status;
  cublasStatus_t cbstatus;
-  status=culaSelectDevice(card); /* FIXME: Do not enable CULA if its not going to be used */
  if (usecula) {
+   status=culaSelectDevice(card); /* Do not enable CULA if its not going to be used */
+   /* if first try failed, wait and retry */
+   if (status) {
+    fprintf(stderr,"%s: %d: CULA device select failure, retrying\n",__FILE__,__LINE__);
+    sleep(10);
+    status=culaSelectDevice(card);
+   }
   checkStatus(status,__FILE__,__LINE__);
   status=culaInitialize();
   checkStatus(status,__FILE__,__LINE__);
@ -1189,13 +1196,21 @@ attach_gpu_to_thread2(int card,  cublasHandle_t *cbhandle,float **WORK, int64_t
     exit(1);
   }
   checkStatus(status,__FILE__,__LINE__);
+   cudaSetDevice(card); /* we need this */
+  } else { /* not using CULA */
+   cudaSetDevice(card);
  }

-  cudaSetDevice(card); /* do we need this because sometimes we do not use cula */
  cbstatus=cublasCreate(cbhandle);
  if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr,"%s: %d: CUBLAS create fail\n",__FILE__,__LINE__);
-    exit(1);
+    /* retry once more before exiting */
+    fprintf(stderr,"%s: %d: CUBLAS create failure, retrying\n",__FILE__,__LINE__);
+    sleep(10);
+    cbstatus=cublasCreate(cbhandle);
+    if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
+     fprintf(stderr,"%s: %d: CUBLAS create fail\n",__FILE__,__LINE__);
+     exit(1);
+    }
  }

  err=cudaMalloc((void**)WORK, (size_t)work_size);
@ -1246,7 +1261,6 @@ detach_gpu_from_thread2(int card,cublasHandle_t cbhandle,float *WORK, int usecul
  }
  cudaFree(WORK);

-  //cudaSetDevice(card);
  cudaDeviceReset();
 }
 void
--- a/src/lib/enveor.sh
+++ b/src/lib/enveor.sh
@ -4,6 +4,8 @@ export LD_LIBRARY_PATH
 source /opt/intel/composerxe/bin/compilervars.sh intel64

 export MIC_ENV_PREFIX=MIC
+##export MKL_MIC_ENABLE=1
+##export OMP_NUM_THREADS=16
 export MIC_PREFIX=MIC
 export MIC_OMP_NUM_THREADS=240
 # 
--- a/src/lib/lmfit.c
+++ b/src/lib/lmfit.c
@ -1975,6 +1975,8 @@ sagefit_visibilities_dual_pt(double *u, double *v, double *w, double *x, int N,
    free(robust_nuM);
    if (robust_nu0<nulow) {
     robust_nu0=nulow;
+    } else if (robust_nu0>nuhigh) {
+     robust_nu0=nuhigh;
    }
  }

@ -2363,7 +2365,10 @@ sagefit_visibilities_dual_pt_one_gpu(double *u, double *v, double *w, double *x,
    free(robust_nuM);
    if (robust_nu0<nulow) {
     robust_nu0=nulow;
+    } else if (robust_nu0>nuhigh) {
+     robust_nu0=nuhigh;
    }
+
  }

  /******** free threads ***************/
@ -2503,7 +2508,7 @@ pipeline_slave_code_flt(void *data)
  //printf("state=%d, thread %d\n",gd->status[tid],tid);
  if (gd->status[tid]==PT_DO_WORK_LM || gd->status[tid]==PT_DO_WORK_OSLM
   || gd->status[tid]==PT_DO_WORK_RLM || gd->status[tid]==PT_DO_WORK_OSRLM
-   || gd->status[tid]==PT_DO_WORK_RTR || gd->status[tid]==PT_DO_WORK_RRTR ) {
+   || gd->status[tid]==PT_DO_WORK_RTR || gd->status[tid]==PT_DO_WORK_RRTR || gd->status[tid]==PT_DO_WORK_NSD) {
 /************************* work *********************/
  me_data_t *t=(me_data_t *)gd->lmdata[tid];
  /* divide the tiles into chunks tilesz/nchunk */
@ -2535,21 +2540,18 @@ pipeline_slave_code_flt(void *data)
     } else if (gd->status[tid]==PT_DO_WORK_RLM) {
      ret=rlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,(void*)gd->lmdata[tid]);
     } else if (gd->status[tid]==PT_DO_WORK_OSRLM) {
-      ret=osrlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,gd->randomize,(void*)gd->lmdata[tid]);
+      ret=osrlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,gd->randomize,0,(void*)gd->lmdata[tid]); /* FIXME 0 for whiten */
     } else if (gd->status[tid]==PT_DO_WORK_RTR) {
      /* note stations: M/8, baselines ntiles*Nbase RSD+RTR */
      float Delta0=0.01f; /* use very small value because previous LM has already made the solution close to true value */
-      /* storage: 
-       need (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N
-       where N<=M/8 M<=ntiles*Nbase, Nbase<=ntiles*Nbase,
-         BlocksPerGrid=ceil(ntiles*Nbase/128)
-      so(max): M*( 13 + ntiles*Nbase/128 +1 +1) + 18*ntiles*Nbase
-        where M: no of stations (params/8), ntiles: tile size, Nbase: baselines */
+      /* storage: see function header 
+        */
      ret=rtr_solve_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid],  cj, ntiles, (void*)gd->lmdata[tid]);
     } else if (gd->status[tid]==PT_DO_WORK_RRTR) {
      float Delta0=0.01f;
-      /* storage (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M */
      ret=rtr_solve_cuda_robust_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid],  cj, ntiles, (void*)gd->lmdata[tid]);
+     } else if (gd->status[tid]==PT_DO_WORK_NSD) {
+      ret=nsd_solve_cuda_robust_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+15, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid],  cj, ntiles, (void*)gd->lmdata[tid]);
     }
     init_res+=gd->info[tid][0];
     final_res+=gd->info[tid][1];
@ -2654,7 +2656,6 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
  int *cr=0; /* array for random permutation of clusters */
  int c0,c1;

-  //opts[0]=LM_INIT_MU; opts[1]=1E-15; opts[2]=1E-15; opts[3]=1E-20;
  opts[0]=CLM_INIT_MU; opts[1]=1E-9; opts[2]=1E-9; opts[3]=1E-9;
  opts[4]=-CLM_DIFF_DELTA;

@ -2747,7 +2748,7 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
  }
-  if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
+  if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
   if ((robust_nuM=(double*)calloc((size_t)(M),sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
     exit(1);
@ -2775,37 +2776,37 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
   int64_t data_sz=0;
   /* size for MLM (disabled) */
   //data_sz=(int64_t)(n+Mm*n+n+n+n+Mm+Mm+Mm*Mm+Mm*Mm+Mm+Mm+Mm+Mm+Mm+Nbase1*8)*sizeof(double)+(int64_t)Nbase1*2*sizeof(char);
-   /* size for RTR (float)  : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
-        where M: no of stations (params/8) ntiles: tile size Nbase: baselines  
+
+   /* size for RTR/NSD (float), 128 is the ThreadsPerBlock   
   */
-   int64_t data_sz_rtr=0;
   if (solver_mode==SM_RTR_OSLM_LBFGS) {
-     data_sz_rtr=(N*(13+Nbase1/128+1+1)+18*Nbase1)*sizeof(float);
+     /* use same size as robust version, probably is lower */
+     data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
   } else if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
-     /* 2 x Nbase more than normal RTR for weight, log(weight) */
-     data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
-   }
-   if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS) {
+     data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
+   } else if (solver_mode==SM_NSD_RLBFGS) {
+     data_sz=(8*N*(7+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
+   } else if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS) {
    /* size for LM */
    data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
-    if(solver_mode==SM_RTR_OSLM_LBFGS) {
-     data_sz=MAX(data_sz,data_sz_rtr);
+    if (linsolv==1) {
+     data_sz+=(int64_t)Mm*sizeof(float);
+    } else if (linsolv==2) {
+     data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
    }
-   } else if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
+   } else if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS) {
   /* size for ROBUSTLM */
     data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
-    if(solver_mode==SM_RTR_OSRLM_RLBFGS) {
-     data_sz=MAX(data_sz,data_sz_rtr);
+    if (linsolv==1) {
+     data_sz+=(int64_t)Mm*sizeof(float);
+    } else if (linsolv==2) {
+     data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
    }
   } else {
    fprintf(stderr,"%s: %d: invalid mode for solver\n",__FILE__,__LINE__);
    exit(1);
   }
-   if (linsolv==1) {
-    data_sz+=(int64_t)Mm*sizeof(float);
-   } else if (linsolv==2) {
-    data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
-   }
+
  tpg.data_size=data_sz;
  tpg.nulow=nulow;
  tpg.nuhigh=nuhigh;
@ -2938,6 +2939,11 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
       lmdata0.robust_nu=lmdata1.robust_nu=robust_nu0; /* initial robust nu */
      } 
      tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
+     } else if (solver_mode==SM_NSD_RLBFGS) {
+      if (!ci) {
+       lmdata0.robust_nu=lmdata1.robust_nu=robust_nu0; /* initial robust nu */
+      } 
+      tpg.status[0]=tpg.status[1]=PT_DO_WORK_NSD;
     } else {
 #ifndef USE_MIC
        fprintf(stderr,"%s: %d: undefined solver mode\n",__FILE__,__LINE__);
@ -2965,7 +2971,7 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
      nerr[c1]=0.0;
     }
     /* update robust_nu */
-     if ((solver_mode==SM_RLM_RLBFGS  || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
+     if ((solver_mode==SM_RLM_RLBFGS  || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
      robust_nuM[c0]+=lmdata0.robust_nu;
      robust_nuM[c1]+=lmdata1.robust_nu;
     }
@ -3047,11 +3053,15 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
     }  else if (solver_mode==SM_RTR_OSLM_LBFGS) {
        tpg.status[0]=PT_DO_WORK_RTR;
     }  else if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
-       /* last iteration is OSRLM */
       if (!ci) {
        lmdata0.robust_nu=robust_nu0; /* initial robust nu */
       } 
       tpg.status[0]=PT_DO_WORK_RRTR;
+     }  else if (solver_mode==SM_NSD_RLBFGS) {
+       if (!ci) {
+        lmdata0.robust_nu=robust_nu0; /* initial robust nu */
+       } 
+       tpg.status[0]=PT_DO_WORK_NSD;
     } else {
 #ifndef USE_MIC
        fprintf(stderr,"%s: %d: undefined solver mode\n",__FILE__,__LINE__);
@ -3077,7 +3087,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
      nerr[c0]=0.0;
     }
     /* update robust_nu */
-     if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
+     if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
      robust_nuM[c0]+=lmdata0.robust_nu;
     }
     /* once again subtract solved model from data */
@ -3108,7 +3118,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
  free(xdummy1f);
  free(pf);
  free(ddcohf);
-  if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
+  if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
    /* calculate mean robust_nu over all clusters */
    robust_nu0=my_dasum(M,robust_nuM)/(double)M;
 #ifdef DEBUG
@ -3120,7 +3130,10 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
    free(robust_nuM);
    if (robust_nu0<nulow) {
     robust_nu0=nulow;
+    } else if (robust_nu0>nuhigh) {
+     robust_nu0=nuhigh;
    }
+
  }
  /******** free threads ***************/
  sync_barrier(&(tp.gate1)); /* sync at gate 1*/
@ -3133,11 +3146,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);

  if (max_lbfgs>0) {
  /* use LBFGS */
-   if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
-    /* if RTR, divide by 8 */
-    if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
-     robust_nu0 *=0.125;  
-    }
+   if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
    lmdata0.robust_nu=robust_nu0;
    ret=lbfgs_fit_robust_cuda(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
   } else {
--- a/src/lib/lmfit_nocuda.c
+++ b/src/lib/lmfit_nocuda.c
@ -646,7 +646,6 @@ predict_threadfn_withgain_full(void *data) {
       */

     px=(ci+t->boff)/((Ntilebase+t->carr[cm].nchunk-1)/t->carr[cm].nchunk);
-     //pm=&(t->p[cm*8*N]);
     pm=&(t->p[t->carr[cm].p[px]]);
     G1[0]=(pm[sta1*8])+_Complex_I*(pm[sta1*8+1]);
     G1[1]=(pm[sta1*8+2])+_Complex_I*(pm[sta1*8+3]);
@ -710,7 +709,6 @@ minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {
  int Nbase1=(dp->Nbase)*(dp->tilesz);

  /* calculate min baselines a thread can handle */
-  //Nthb0=ceil((double)Nbase1/(double)Nt);
  Nthb0=(Nbase1+Nt-1)/Nt;

  /* setup threads */
@ -800,7 +798,6 @@ minimize_viz_full_pth00(double *p, double *x, int m, int n, void *data) {
 int Ntilebase=(dp->Nbase)*(dp->tilesz);
 int px;

- #pragma omp parallel for
 for (ci=0; ci<Ntilebase; ci++) {
   /* iterate over the sky model and calculate contribution */
   /* for this x[8*ci:8*(ci+1)-1] */
@ -873,7 +870,7 @@ minimize_viz_full_pth00(double *p, double *x, int m, int n, void *data) {

 int
 sagefit_visibilities(double *u, double *v, double *w, double *x, int N,   
-   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *mean_nu, double *res_0, double *res_1) {
+   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, int whiten, double *mean_nu, double *res_0, double *res_1) {
  /* u,v,w : size Nbase*tilesz x 1  x: size Nbase*8*tilesz x 1 */
  /* barr: size Nbase*tilesz x 1 carr: size Mx1 */
  /* pp: size 8*N*M x 1 */
@ -943,7 +940,7 @@ sagefit_visibilities(double *u, double *v, double *w, double *x, int N,
 #endif
     exit(1);
  }
-  if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
+  if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
   if ((robust_nuM=(double*)calloc((size_t)(M),sizeof(double)))==0) {
 #ifndef USE_MIC
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@ -1012,7 +1009,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
         /* only the last EM iteration is robust */
         if (ci==max_emiter-1){
          lmdata.robust_nu=robust_nu0;
-          ret=rlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, (void*)&lmdata);  
+          ret=rlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, whiten, (void*)&lmdata);  
          /* get updated value of robust_nu */
          robust_nuM[cj]+=lmdata.robust_nu;
          } else {
@ -1022,7 +1019,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
         /* only the last EM iteration is robust */
         if (ci==max_emiter-1){
          lmdata.robust_nu=robust_nu0;
-          ret=osrlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt,  nulow, nuhigh, randomize, (void*)&lmdata);  
+          ret=osrlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt,  nulow, nuhigh, randomize, whiten, (void*)&lmdata);  
          /* get updated value of robust_nu */
          robust_nuM[cj]+=lmdata.robust_nu;
          } else {
@ -1042,6 +1039,15 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
           if (ci==max_emiter-1){
            robust_nuM[cj]+=lmdata.robust_nu;
           }
+       } else if (solver_mode==SM_NSD_RLBFGS) { /* Nesterov's */
+            /* NSD */
+           if (!ci){
+            lmdata.robust_nu=robust_nu0;
+           } 
+           ret=nsd_solve_nocuda_robust(&p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+15, nulow, nuhigh, info, &lmdata);
+           if (ci==max_emiter-1){
+            robust_nuM[cj]+=lmdata.robust_nu;
+           }
       } else { /* not used */
         //ret=mlm_der_single(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, (void*)&lmdata);  
 #ifndef USE_MIC
@ -1069,7 +1075,7 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
     mylm_fit_single_pth(p, xsub, 8*N, n, (void*)&lmdata);
     my_daxpy(n, xsub, -1.0, xdummy);
     /* if robust LM, calculate average nu over hybrid clusters */
-     if ((solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
+     if ((solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
      robust_nuM[cj]/=(double)carr[cj].nchunk;
     }
    }
@ -1088,7 +1094,7 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
 }
  free(nerr);
  free(xdummy);
-  if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
+  if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
    /* calculate mean robust_nu over all clusters */
    robust_nu0=my_dasum(M,robust_nuM)/(double)M;
 #ifdef DEBUG
@ -1100,6 +1106,8 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
    free(robust_nuM);
    if (robust_nu0<nulow) {
     robust_nu0=nulow;
+    } else if (robust_nu0>nuhigh) {
+     robust_nu0=nuhigh;
    }
  }

@ -1108,13 +1116,10 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
  lmdata.Nt=32; /* FIXME increase threads for MIC */
 #endif
  /* use LBFGS */
-   if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS ||  solver_mode==SM_RTR_OSRLM_RLBFGS) {
-    /* if RTR, divide by 8 */
-    if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
-     robust_nu0 *=0.125;
-    }
+   if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS ||  solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
    lmdata.robust_nu=robust_nu0;
-    ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    /* pre-whiten data when calculating residual */
+    ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, whiten, (void*)&lmdata);
   } else {
    ret=lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
   }
@ -1273,7 +1278,7 @@ bfgsfit_visibilities(double *u, double *v, double *w, double *x, int N,
  /* use LBFGS */
   if (solver_mode==2 || solver_mode==3) {
    lmdata.robust_nu=mean_nu;
-    ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, 0, (void*)&lmdata); /* 0 to disable whitening */
   } else {
    ret=lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
   }
--- a/src/lib/manifold_fl.cu
+++ b/src/lib/manifold_fl.cu
@ -1352,7 +1352,7 @@ kernel_fns_fupdate_weights(int N, int Nbase, cuFloatComplex *x, float *y, float
      1) its not flagged (sta1,sta2)>=0
    */
    float sumn=0.0f;
-    float temp1,temp2,tt,yy,c=0.0f;
+    float temp1,temp2,tt;
    if (sta1>=0 && sta2>=0) {
     cuFloatComplex G1[4];
     cuFloatComplex G2[4];
@ -1382,25 +1382,28 @@ kernel_fns_fupdate_weights(int N, int Nbase, cuFloatComplex *x, float *y, float
     /* T=T*G2' */
     ambt(T1,G2,T2);

-     /* error using Kahan summation */
+     /* use p=2, find MAX value of residual error out of XX,XY,YX,YY
+        instead of the sum */
     /* V->U */
     temp1=y[8*n]-T2[0].x; 
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     temp1=y[8*n+1]-T2[0].y;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
+     temp2=y[8*n+1]-T2[0].y;
+     sumn=temp1*temp1+temp2*temp2;
     temp1=y[8*n+2]-T2[1].x;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     temp1=y[8*n+3]-T2[1].y;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
+     temp2=y[8*n+3]-T2[1].y;
+     tt=temp1*temp1+temp2*temp2;
+     if (sumn<tt) { sumn=tt; }
+     
     temp1=y[8*n+4]-T2[2].x;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     temp1=y[8*n+5]-T2[2].y;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
+     temp2=y[8*n+5]-T2[2].y;
+     tt=temp1*temp1+temp2*temp2;
+     if (sumn<tt) { sumn=tt; }
+
     temp1=y[8*n+6]-T2[3].x;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     temp1=y[8*n+7]-T2[3].y;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     wtd[n]=(nu0+8.0f)/(nu0+sumn);  /* 8 variate T distribution */
+     temp2=y[8*n+7]-T2[3].y;
+     tt=temp1*temp1+temp2*temp2;
+     if (sumn<tt) { sumn=tt; }
+     //wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */ 
+     wtd[n]=(nu0+2.0f)/(nu0+sumn); /* 2 variate T distribution */ 
    } 
  }

@ -1422,7 +1425,7 @@ kernel_fns_fupdate_weights_q(int N, int Nbase, cuFloatComplex *x, float *y, floa
      1) its not flagged (sta1,sta2)>=0
    */
    float sumn=0.0f;
-    float temp1,temp2,tt,yy,c=0.0f;
+    float temp1,temp2,tt;
    if (sta1>=0 && sta2>=0) {
     cuFloatComplex G1[4];
     cuFloatComplex G2[4];
@ -1452,25 +1455,28 @@ kernel_fns_fupdate_weights_q(int N, int Nbase, cuFloatComplex *x, float *y, floa
     /* T=T*G2' */
     ambt(T1,G2,T2);

-     /* error using Kahan summation */
+     /* use p=2, find MAX value of residual error out of XX,XY,YX,YY
+        instead of the sum */
     /* V->U */
     temp1=y[8*n]-T2[0].x; 
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     temp1=y[8*n+1]-T2[0].y;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
+     temp2=y[8*n+1]-T2[0].y;
+     sumn=temp1*temp1+temp2*temp2;
     temp1=y[8*n+2]-T2[1].x;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     temp1=y[8*n+3]-T2[1].y;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
+     temp2=y[8*n+3]-T2[1].y;
+     tt=temp1*temp1+temp2*temp2;
+     if (sumn<tt) { sumn=tt; }
+     
     temp1=y[8*n+4]-T2[2].x;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     temp1=y[8*n+5]-T2[2].y;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
+     temp2=y[8*n+5]-T2[2].y;
+     tt=temp1*temp1+temp2*temp2;
+     if (sumn<tt) { sumn=tt; }
+
     temp1=y[8*n+6]-T2[3].x;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     temp1=y[8*n+7]-T2[3].y;
-     temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
-     wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */ 
+     temp2=y[8*n+7]-T2[3].y;
+     tt=temp1*temp1+temp2*temp2;
+     if (sumn<tt) { sumn=tt; }
+     //wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */ 
+     wtd[n]=(nu0+2.0f)/(nu0+sumn); /* 2 variate T distribution */ 
     qd[n]=wtd[n]-logf(wtd[n]);  
    } 
  }
@ -1667,7 +1673,7 @@ cudakernel_fns_f(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatCo

  return ed: error vector, BlocksPerGridx1
 */
-/* need BlocksPerGrid+1+L float storage */
+/* need BlocksPerGrid+4+L float storage <= (2 BlocksPerGrid + 4) */
 float 
 cudakernel_fns_f_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, float *y, float *coh, char *bbh, float *wtd, float *gWORK) {
 #ifdef CUDA_DBG
--- a/src/lib/predict.c
+++ b/src/lib/predict.c
@ -87,7 +87,7 @@ calculate_uv_mode_vectors_scalar(double u, double v, double beta, int n0, double
 		shpvl[zci][xci]=H_e(xval,xci)*expval/sqrt((double)(2<<xci)*fact[xci]);
 	}
 	zci=1;
- 	xval=v*beta;
+  xval=v*beta;
  expval=exp(-0.5*xval*xval);
 	for (xci=0; xci<n0; xci++) {
 		shpvl[zci][xci]=H_e(xval,xci)*expval/sqrt((double)(2<<xci)*fact[xci]);
--- a/src/lib/readsky.c
+++ b/src/lib/readsky.c
@ -209,10 +209,10 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
  /* each element of list is a list of source names */
  if ((cfp=fopen(clusterfile,"r"))==0) {
      fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
-      return 1;
+      exit(1);
  }
  /* allocate memory for buffer */
-  buff_len = 2048; /* FIXME: handle long names */
+  buff_len = MAX_SNAME; /* handle long names */
  if((buff = (char*)malloc(sizeof(char)*(size_t)(buff_len)))==NULL) {
        fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
        exit(1);
@ -231,7 +231,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
     /* new cluster found */
     if ((clus= (clust_t*)malloc(sizeof(clust_t)))==0) {
            fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-            return 1;
+            exit(1);
     }
     sscanf(buff,"%d",&clus->id);
     clus->slist=NULL;
@ -248,11 +248,11 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
      /* source found for this cluster */
      if ((sclus= (clust_n*)malloc(sizeof(clust_n)))==0) {
            fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-            return 1;
+            exit(1);
      }     
      if ((sclus->name=(char*)malloc((size_t)(strlen(buff)+1)*sizeof(char)))==0) {
            fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-            return 1;
+            exit(1);
      }
      strcpy(sclus->name,buff);
      clus->slist=g_list_prepend(clus->slist,sclus);
@ -277,10 +277,10 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
  */
  if ((cfp=fopen(skymodel,"r"))==0) {
      fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
-      return 1;
+      exit(1);
  }

-  if ((buff = (char*)realloc((void*)(buff),sizeof(char)*(size_t)(128)))==NULL) {
+  if ((buff = (char*)realloc((void*)(buff),sizeof(char)*(size_t)(MAX_SNAME)))==NULL) {
     fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
     exit(1);
  }
@ -298,12 +298,12 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
    if (c!=EOF && c>0) {
      if ((hkey=(char*)malloc((size_t)(strlen(buff)+1)*sizeof(char)))==0) {
            fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-            return 1;
+            exit(1);
      }    
      strcpy(hkey,buff);
      if ((source=(sinfo_t*)malloc(sizeof(sinfo_t)))==0) {
            fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-            return 1;
+            exit(1);
      }      
      /* calculate l,m */
      /* Rad=(hr+min/60+sec/60*60)*pi/12 */
@ -328,11 +328,11 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
       fratio=log(freq0/f0);
       fratio1=fratio*fratio;
       fratio2=fratio1*fratio;
-       /* catch -ve sI */
+       /* catch -ve and 0 sI */
       if (sI>0.0) {
        source->sI=exp(log(sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2);
       } else {
-        source->sI=-exp(log(-sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2);
+        source->sI=(sI==0.0?0.0:-exp(log(-sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2));
       }
      } else {
       source->sI=sI;
@ -365,7 +365,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
       source->stype=STYPE_GAUSSIAN;
       if((exg=(exinfo_gaussian *)malloc(sizeof(exinfo_gaussian)))==0) {
         fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-         return 1;
+         exit(1);
       } 
       exg->eX=2.0*eX; /* scale by 2 */
       exg->eY=2.0*eY;
@ -387,7 +387,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
       source->stype=STYPE_DISK;
       if((exd=(exinfo_disk*)malloc(sizeof(exinfo_disk)))==0) {
         fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-         return 1;
+         exit(1);
       } 
       exd->eX=eX;
       /* negate angles */
@ -407,7 +407,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
       source->stype=STYPE_RING;
       if((exr=(exinfo_ring*)malloc(sizeof(exinfo_ring)))==0) {
         fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-         return 1;
+         exit(1);
       } 
       exr->eX=eX;
       /* negate angles */
@ -427,7 +427,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
       source->stype=STYPE_SHAPELET;
       if((exs=(exinfo_shapelet*)malloc(sizeof(exinfo_shapelet)))==0) {
         fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-         return 1;
+         exit(1);
       } 
       exs->eX=eX;
       exs->eY=eY;
@ -474,7 +474,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
  /* setup the array of cluster/source information */
  if ((*carr=(clus_source_t*)malloc((size_t)(g_list_length(clusters))*sizeof(clus_source_t)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
  } 
  
  ci=0;
@ -491,48 +491,48 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **

    if (((*carr)[ci].ll=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].mm=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].nn=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].sI=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].stype=(unsigned char*)malloc((size_t)((*carr)[ci].N)*sizeof(unsigned char)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].ex=(void**)malloc((size_t)((*carr)[ci].N)*sizeof(void*)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    /* for handling multi channel data */
    if (((*carr)[ci].sI0=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].f0=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].spec_idx=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].spec_idx1=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }
    if (((*carr)[ci].spec_idx2=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-     return 1;
+     exit(1);
    }


@ -689,3 +689,55 @@ update_ignorelist(const char *ignfile, int *ignlist, int M, clus_source_t *carr)
    printf("Total %d clustes ignored in simulation.\n",cn);
    return 0;
 }
+
+
+
+
+/* Read ADMM regularization (rho) values, one line per cluster, from a
+   text file. Each data line is parsed as: cluster_id hybrid rho.
+   arho (Mt values) gets rho repeated hybrid times, arhoslave (M values)
+   gets rho once; both are filled starting from the last element.
+   Exits if the file cannot be opened, a size mismatch is only reported
+   on stderr. Always returns 0. */
+int
+read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double *arhoslave) {
+  FILE *cfp;
+  int c,ci,cj,cluster_id,hybrid,hb;
+  int mismatch=0; /* set when the file has more values than arho/arhoslave */
+  double admm_rho;
+  if ((cfp=fopen(admm_rho_file,"r"))==0) {
+      fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
+      exit(1);
+  }
+
+  c=skip_lines(cfp);
+  ci=0; /* number of values stored in arho */
+  cj=0; /* number of clusters stored in arhoslave */
+  while(c>=0) {
+    c=fscanf(cfp,"%d %d %lf",&cluster_id,&hybrid,&admm_rho);
+    /* use a line only when all 3 fields are read, otherwise hybrid or
+       admm_rho would be undefined or left over from an earlier line */
+    if (c==3) {
+      if (cj<M) {
+        arhoslave[M-1-cj]=admm_rho; /* reverse order */
+        if (hybrid>0) { cj++; }
+      } else {
+        mismatch=1; /* more clusters in file than M */
+      }
+      for (hb=0; hb<hybrid; hb++) {
+        /* array size does not match one given by text file */
+        if (ci>=Mt) { mismatch=1; break; }
+        arho[Mt-1-ci]=admm_rho; /* reverse order */
+        ci++;
+      }
+    }
+    c=skip_restof_line(cfp);
+    c=skip_lines(cfp);
+  }
+  /* report any errors: ci must be exactly Mt for a matching file */
+  if (mismatch || c!=EOF || ci!=Mt) {
+    fprintf(stderr,"%s: %d: Error: cluster numbers in cluster file and regularization file do not match up.\n",__FILE__,__LINE__);
+  }
+  fclose(cfp);
+  return 0;
+}
--- a/src/lib/residual.c
+++ b/src/lib/residual.c
@ -573,11 +573,11 @@ residual_threadfn_onefreq(void *data) {
         fratio=log(freq0/t->carr[cm].f0[cn]);
         fratio1=fratio*fratio;
         fratio2=fratio1*fratio;
-         /* catch -ve sI */
+         /* catch -ve and 0 sI */
         if (t->carr[cm].sI0[cn]>0.0) {
          prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
         } else {
-          prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
+          prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
         }
       } else {
         prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
@ -846,11 +846,11 @@ residual_threadfn_multifreq(void *data) {
         fratio=log(freq0/t->carr[cm].f0[cn]);
         fratio1=fratio*fratio;
         fratio2=fratio1*fratio;
-         /* catch -ve sI */
+         /* catch -ve and 0 sI */
         if (t->carr[cm].sI0[cn]>0.0) {
          prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
         } else {
-          prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
+          prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
         }
       } else {
         prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
@ -1087,11 +1087,11 @@ visibilities_threadfn_multifreq(void *data) {
         fratio=log(freq0/t->carr[cm].f0[cn]);
         fratio1=fratio*fratio;
         fratio2=fratio1*fratio;
-         /* catch -ve sI */ 
+         /* catch -ve and 0 sI */ 
         if (t->carr[cm].sI0[cn]>0.0) {
          prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
         } else {
-          prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
+          prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
         }
       } else {
         prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
@ -1151,7 +1151,7 @@ visibilities_threadfn_multifreq(void *data) {

 /* FIXME: tail timeslots still not written properly (probably due to flagging while reading data) */
 int
-predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,int Nt) {
+predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,int Nt, int add_to_data) {
  int nth,nth1,ci;

  int Nthb0,Nthb;
@ -1177,8 +1177,10 @@ predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int
    exit(1);
  }

-  /* set output column to zero */
-  memset(x,0,sizeof(double)*8*Nbase*tilesz*Nchan);
+  if (!add_to_data) {
+   /* set output column to zero */
+   memset(x,0,sizeof(double)*8*Nbase*tilesz*Nchan);
+  }

  /* iterate over threads, allocating baselines per thread */
  ci=0;
@ -1211,6 +1213,8 @@ predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int
    threaddata[nth].fdelta=fdelta/(double)Nchan;
    threaddata[nth].tdelta=tdelta;
    threaddata[nth].dec0=dec0;
+    threaddata[nth].add_to_data=add_to_data;
+    
   
    pthread_create(&th_array[nth],&attr,visibilities_threadfn_multifreq,(void*)(&threaddata[nth]));
    /* next baseline set */
@ -1252,8 +1256,10 @@ predictwithgain_threadfn_multifreq(void *data) {
   /* iterate over the sky model and calculate contribution */
   /* for this x[8*ci:8*(ci+1)-1] */
   /* if this baseline is flagged, we do not compute */
-   for (cf=0; cf<t->Nchan; cf++) {
-    memset(&t->x[8*ci+cf*Ntilebase*8],0,sizeof(double)*8);
+   if (!t->add_to_data) { /* only model is written as output */
+    for (cf=0; cf<t->Nchan; cf++) {
+     memset(&t->x[8*ci+cf*Ntilebase*8],0,sizeof(double)*8);
+    }
   }

   /* stations for this baseline */
@ -1303,11 +1309,11 @@ predictwithgain_threadfn_multifreq(void *data) {
         fratio=log(freq0/t->carr[cm].f0[cn]);
         fratio1=fratio*fratio;
         fratio2=fratio1*fratio;
-         /* catch -ve sI */
+         /* catch -ve and 0 sI */
         if (t->carr[cm].sI0[cn]>0.0) {
          prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
         } else {
-          prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
+          prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
         }
       } else {
         prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
@ -1364,12 +1370,44 @@ predictwithgain_threadfn_multifreq(void *data) {
     }
     }
   }
+  /* if valid cluster is given, correct with its solutions */
+   if (t->pinv) {
+    cm=t->ccid;
+    px=(ci+t->boff)/((Ntilebase+t->carr[cm].nchunk-1)/t->carr[cm].nchunk);
+    pm=&(t->pinv[8*t->N*px]);
+    G1[0]=(pm[sta1*8])+_Complex_I*(pm[sta1*8+1]);
+    G1[1]=(pm[sta1*8+2])+_Complex_I*(pm[sta1*8+3]);
+    G1[2]=(pm[sta1*8+4])+_Complex_I*(pm[sta1*8+5]);
+    G1[3]=(pm[sta1*8+6])+_Complex_I*(pm[sta1*8+7]);
+    G2[0]=(pm[sta2*8])+_Complex_I*(pm[sta2*8+1]);
+    G2[1]=(pm[sta2*8+2])+_Complex_I*(pm[sta2*8+3]);
+    G2[2]=(pm[sta2*8+4])+_Complex_I*(pm[sta2*8+5]);
+    G2[3]=(pm[sta2*8+6])+_Complex_I*(pm[sta2*8+7]);
+
+     /* now do correction, if any */
+     C[0]=t->x[8*ci]+_Complex_I*t->x[8*ci+1];
+     C[1]=t->x[8*ci+2]+_Complex_I*t->x[8*ci+3];
+     C[2]=t->x[8*ci+4]+_Complex_I*t->x[8*ci+5];
+     C[3]=t->x[8*ci+6]+_Complex_I*t->x[8*ci+7];
+     /* T1=G1*C  */
+     amb(G1,C,T1);
+     /* T2=T1*G2' */
+     ambt(T1,G2,T2);
+     t->x[8*ci]=creal(T2[0]);
+     t->x[8*ci+1]=cimag(T2[0]);
+     t->x[8*ci+2]=creal(T2[1]);
+     t->x[8*ci+3]=cimag(T2[1]);
+     t->x[8*ci+4]=creal(T2[2]);
+     t->x[8*ci+5]=cimag(T2[2]);
+     t->x[8*ci+6]=creal(T2[3]);
+     t->x[8*ci+7]=cimag(T2[3]);
+   }
 }
 return NULL;
 }

 int
-predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignlist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt) {
+predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignlist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt, int add_to_data, int ccid, double rho) {
  int nth,nth1,ci;

  int Nthb0,Nthb;
@ -1379,6 +1417,32 @@ predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,d

  int Nbase1=Nbase*tilesz;

+
+  int cm,cj;
+  double *pm,*pinv=0;
+  cm=-1;
+  /* find if any cluster is specified for correction of data */
+  for (cj=0; cj<M; cj++) { /* clusters */
+    /* check if cluster id == ccid to do a correction */
+    if (carr[cj].id==ccid) {
+     cm=cj;
+     ci=1; /* correction cluster found */
+    }
+  }
+  if (cm>=0) { /* valid cluster for correction */
+   /* allocate memory for inverse J */
+   if ((pinv=(double*)malloc((size_t)8*N*carr[cm].nchunk*sizeof(double)))==0) {
+     fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+     exit(1);
+   }
+   for (cj=0; cj<carr[cm].nchunk; cj++) {
+    pm=&(p[carr[cm].p[cj]]); /* start of solutions */
+    /* invert N solutions */
+    for (ci=0; ci<N; ci++) {
+     mat_invert(&pm[8*ci],&pinv[8*ci+8*N*cj], rho);
+    }
+   }
+  }
    
  /* calculate min baselines a thread can handle */
  Nthb0=(Nbase1+Nt-1)/Nt;
@ -1427,6 +1491,11 @@ predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,d
    threaddata[nth].fdelta=fdelta/(double)Nchan;
    threaddata[nth].tdelta=tdelta;
    threaddata[nth].dec0=dec0;
+    threaddata[nth].add_to_data=add_to_data;
+    /* for correction of data */
+    threaddata[nth].pinv=pinv;
+    threaddata[nth].ccid=cm;
+
    
    pthread_create(&th_array[nth],&attr,predictwithgain_threadfn_multifreq,(void*)(&threaddata[nth]));
    /* next baseline set */
--- a/src/lib/robust_fl.cu
+++ b/src/lib/robust_fl.cu
@ -321,10 +321,11 @@ __global__ void
 kernel_evaluatenu_fl_eight(int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  /* each block calculte  psi((nu+8)/2)-log((nu+8)/2) */
+  /* actually p=2, so psi((nu+2)/2)-log((nu+2)/2) */
  float dgm0;
  if (threadIdx.x==0) {
-   dgm0=digamma_fl(nu0*0.5f+4.0f);
-   dgm0=dgm0-logf((nu0+8.0f)*0.5f); /* psi((nu0+8)/2)-log((nu0+8)/2) */
+   dgm0=digamma_fl(nu0*0.5f+1.0f);
+   dgm0=dgm0-logf((nu0+2.0f)*0.5f); /* psi((nu0+2)/2)-log((nu0+2)/2) */
  }
  __syncthreads();
  if (tid<Nd) {
@ -453,7 +454,7 @@ cudakernel_evaluatenu_fl(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float q


 /* evaluate expression for finding optimum nu for 
-  a range of nu values, using AECM
+  a range of nu values, using AECM (p=8 before, but now p=2)
  nu0: current value of robust_nu*/
 void
 cudakernel_evaluatenu_fl_eight(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
--- a/src/lib/robust_lbfgs_nocuda.c
+++ b/src/lib/robust_lbfgs_nocuda.c
@ -911,7 +911,9 @@ func_grad_robust(
 int
 lbfgs_fit_robust(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {
+   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads,
+ int whiten, /* if >0 whiten data 1: NCP, 2... */
+ void *adata) {

  double *gk; /* gradients at both k+1 and k iter */
  double *xk1,*xk; /* parameters at k+1 and k iter */
--- a/src/lib/robustlm.c
+++ b/src/lib/robustlm.c
@ -1204,6 +1204,7 @@ osrlevmar_der_single_cuda_fl(
  int ntiles, /* total tile (data) size being solved for */
  double robust_nulow, double robust_nuhigh, /* robust nu range */
  int randomize, /* if >0 randomize */
+  int whiten, /* if >0 whiten data 1: NCP, 2... */
  void *adata)       /* pointer to possibly additional data, passed uninterpreted to func & jacf.
                      * Set to NULL if not needed
                      */
@ -1883,6 +1884,7 @@ rlevmar_der_single_nocuda(
  int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
  int Nt, /* no of threads */
  double robust_nulow, double robust_nuhigh, /* robust nu range */
+  int whiten, /* if >0 whiten data 1: NCP, 2... */
  void *adata)       /* pointer to possibly additional data, passed uninterpreted to func & jacf.
                      * Set to NULL if not needed
                      */
@ -2019,17 +2021,19 @@ rlevmar_der_single_nocuda(
      exit(1);
  }
  WORK=Ud=Sd=VTd=0;
-//  for (ci=0;ci<M; ci++) {
-//   aones[ci]=1.0;
-//  }
-  me_data_t *dt=(me_data_t*)adata;
-  setweights(M,aones,1.0,dt->Nt);
+  int nw,wt_itmax=3;
+  me_data_t *lmdata=(me_data_t*)adata;
+  double wt_sum,lambda,robust_nu=lmdata->robust_nu;
+  double robust_nu1;

+  setweights(M,aones,1.0,lmdata->Nt);
  /*W set initial weights to 1 */
-//  for (ci=0;ci<N; ci++) {
-//   wtd[ci]=1.0;
-//  }
-  setweights(N,wtd,1.0,dt->Nt);
+  setweights(N,wtd,1.0,lmdata->Nt);
+  /* modify weights with whitening weights */
+  if (whiten) {
+   /* use correct offset for u,v based on tile offset */
+   add_whitening_weights(N/8, wtd, &lmdata->u[lmdata->tileoff*lmdata->Nbase], &lmdata->v[lmdata->tileoff*lmdata->Nbase], *(lmdata->freq0), lmdata->Nt);
+  }

  /* memory allocation: different solvers */
  if (solve_axb==0) {
@ -2078,10 +2082,6 @@ rlevmar_der_single_nocuda(
    }
  }

-  int nw,wt_itmax=3;
-  me_data_t *lmdata=(me_data_t*)adata;
-  double wt_sum,lambda,robust_nu=lmdata->robust_nu;
-  double robust_nu1;

  /* EM iteration loop */
  /************************************************************/
@ -2403,6 +2403,10 @@ printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
   printf("nu updated from %lf in [%lf,%lf] to %lf\n",robust_nu,robust_nulow, robust_nuhigh,robust_nu1);
 #endif
   robust_nu=robust_nu1;
+   if (whiten) {
+    /* use correct offset for u,v based on tile offset */
+    add_whitening_weights(N/8, wtd, &lmdata->u[lmdata->tileoff*lmdata->Nbase], &lmdata->v[lmdata->tileoff*lmdata->Nbase], *(lmdata->freq0), lmdata->Nt);
+   }

   /* normalize weights */
   wt_sum=lambda/(double)N;
@ -2491,6 +2495,7 @@ osrlevmar_der_single_nocuda(
  int Nt, /* no of threads */
  double robust_nulow, double robust_nuhigh, /* robust nu range */
  int randomize, /* if >0 randomize */
+  int whiten, /* if >0 whiten data 1: NCP, 2... */
  void *adata)       /* pointer to possibly additional data, passed uninterpreted to func & jacf.
                      * Set to NULL if not needed
                      */
@ -2611,16 +2616,20 @@ osrlevmar_der_single_nocuda(
      exit(1);
  }
  WORK=Ud=Sd=VTd=0;
-//  for (ci=0;ci<M; ci++) {
-//   aones[ci]=1.0;
-//  }
-  me_data_t *dt=(me_data_t*)adata;
-  setweights(M,aones,1.0,dt->Nt);
+  me_data_t *lmdata0=(me_data_t*)adata;
+  int nw,wt_itmax=3;
+  double wt_sum,lambda,robust_nu=lmdata0->robust_nu;
+  double robust_nu1;
+
+
+  setweights(M,aones,1.0,lmdata0->Nt);
  /*W set initial weights to 1 */
-//  for (ci=0;ci<N; ci++) {
-//   wtd[ci]=1.0;
-//  }
-  setweights(N,wtd,1.0,dt->Nt);
+  setweights(N,wtd,1.0,lmdata0->Nt);
+  /* modify weights with whitening weights */
+  if (whiten) {
+   add_whitening_weights(N/8, wtd, &lmdata0->u[lmdata0->tileoff*lmdata0->Nbase], &lmdata0->v[lmdata0->tileoff*lmdata0->Nbase], *(lmdata0->freq0), lmdata0->Nt);
+  }
+
  /* memory allocation: different solvers */
  if (solve_axb==0) {

@ -2668,10 +2677,7 @@ osrlevmar_der_single_nocuda(
    }
  }

-  int nw,wt_itmax=3;
-  me_data_t *lmdata0=(me_data_t*)adata;
-  double wt_sum,lambda,robust_nu=lmdata0->robust_nu;
-  double robust_nu1;
+
  /* setup OS subsets and stating offsets */
  /* ME data for Jacobian calculation (need a new one) */
  me_data_t lmdata;
@ -2737,7 +2743,6 @@ osrlevmar_der_single_nocuda(
    l=l+Ntpersubset;
  }

-
  /* EM iteration loop */
  /************************************************************/
  for (nw=0; nw<wt_itmax; nw++) {
@ -3051,6 +3056,9 @@ printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
   printf("nu updated from %lf in [%lf,%lf] to %lf\n",robust_nu,robust_nulow, robust_nuhigh,robust_nu1);
 #endif
   robust_nu=robust_nu1;
+   if (whiten) {
+    add_whitening_weights(N/8, wtd, &lmdata0->u[lmdata0->tileoff*lmdata0->Nbase], &lmdata0->v[lmdata0->tileoff*lmdata0->Nbase], *(lmdata0->freq0), lmdata0->Nt);
+   }

   /* normalize weights */
   wt_sum=lambda/(double)N;
--- a/src/lib/rtr_solve_robust.c
+++ b/src/lib/rtr_solve_robust.c
@ -312,6 +312,7 @@ fns_f(complex double *x, double *y,  global_data_rtr_t *gdata) {


 /* worker thread function for weight update (nu+8)/(nu+error^2) */
+/* update: error: max of XX,XY,YX,YY errors, so p=2 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
@ -363,7 +364,8 @@ threadfn_fns_fupdate_weights(void *data) {
     double r11=t->y[8*ci+6]-creal(T2[3]);
     double i11=t->y[8*ci+7]-cimag(T2[3]);

-     t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
+     //t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
+     t->wtd[ci] = (t->nu0+2.0)/(t->nu0+MAX(r00*r00+i00*i00,MAX(r01*r01+i01*i01,MAX(r10*r10+i10*i10,r11*r11+i11*i11))));
   }
 }

@ -475,12 +477,15 @@ fns_fupdate_weights(complex double *x, double *y,  global_data_rtr_t *gdata) {
   pthread_join(gdata->th_array[nth1],NULL);
  }
  sumlogw/=(double)Nbase1;
- free(threaddata);
+  free(threaddata);

- /* find new value for nu, p-variate T dist, p=8 */
+ /* find new value for nu, p-variate T dist, p=8 (update p=2 because using MAX()  for residual calculation, not sum) */
 /*  psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0, AECM */
- double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 8, dp->robust_nu);
+ double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 2, dp->robust_nu);

+ /* make sure new value stays within bounds */
+ if (nu1<gdata->nulow) { return gdata->nulow; }
+ if (nu1>gdata->nuhigh) { return gdata->nuhigh; }
 return nu1;
 }

@ -1400,6 +1405,7 @@ armijostep(int N,complex double *x,complex double *teta, double *y, global_data_
 return nocostred;
 }

+
 /* Fine tune initial trust region radius, also update initial value for x
   A. Sartenaer, 1995
   returns : trust region estimate,
@ -1427,10 +1433,10 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
 fns_fgrad(x,eta,y,gdata,1);
 //normalize
 double eta_nrm=my_cnrm2(4*N,eta);
- my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta);
+ my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta); 

 my_ccopy(4*N,eta,1,s,1);
- my_cscal(4*N, delta_0+0.0*_Complex_I, s);
+ my_cscal(4*N, delta_0+0.0*_Complex_I, s); 
 //Hessian at s
 fns_fhess(x,s,Heta,y,gdata);

@ -1439,7 +1445,7 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
 double mu_0=0.5; double mu_1=0.5; double mu_2=0.35;
 double teta=0.25;

-
+ 
 int m,MK=4;
 for (m=0; m<MK; m++) {
   /* x_prop=x0-s */
@ -1449,7 +1455,7 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
   /* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
   mk=f0-fns_g(N,x_prop,eta,s)-0.5*fns_g(N,x_prop,Heta,s);
   fk=fns_f(x_prop,y,gdata);
-
+ 
   if (f0==mk) {
    rho=1e9;
   } else {
@ -1521,7 +1527,7 @@ printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,
 #endif

   my_ccopy(4*N,eta,1,s,1);
-   my_cscal(4*N,delta_0+0.0*_Complex_I, s);
+   my_cscal(4*N,delta_0+0.0*_Complex_I, s); 
 }


@ -1539,6 +1545,8 @@ printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,
 return Delta0;
 }

+
+
 int
 rtr_solve_nocuda_robust(
  double *x0,         /* initial values and updated solution at output (size 8*N double) */
@ -1694,6 +1702,7 @@ rtr_solve_nocuda_robust(
 int rsdstat=0;
 /***************************************************/
 /* RSD solution */
+ //for (ci=0; ci<itmax_rsd; ci++) {
 for (ci=0; ci<0; ci++) {
  /* Armijo step */
  /* teta=armijostep(V,C,N,x); */
@ -1707,11 +1716,13 @@ rtr_solve_nocuda_robust(
  }
 }

+
 double Delta_new=itrr(N,x,eta,Heta, y, &gdata, fgradx, x_prop);
 #ifdef DEBUG
 printf("TR radius given=%lf est=%lf\n",Delta0,Delta_new);
 #endif

+ //old values
 //Delta_bar=MIN(fx,0.01);
 //Delta0=Delta_bar*0.125;
 Delta0=MIN(Delta_new,0.01); /* need to be more restrictive for EM */
@ -1971,3 +1982,265 @@ Delta = Delta0;
  free(x);
  return 0;
 }
+
+
+/* Robust solver using gradient descent with momentum and an adaptive step size.
+   x0 is overwritten with the new solution only when the final cost is lower
+   than the initial cost.
+   info[0]: cost at the initial solution, info[1]: cost at the final iterate.
+   adata->robust_nu is always replaced by the value returned by
+   fns_fupdate_weights().
+   Returns 0; exits the process if memory cannot be allocated. */
+int
+nsd_solve_nocuda_robust(
+  double *x0,         /* initial values and updated solution at output (size 8*N double) */
+  double *y,         /* data vector (size 8*M double) */
+  int N,              /* no. of stations */
+  int M,              /* no. of constraints */
+  int itermax,          /* maximum number of gradient descent iterations */
+  double robust_nulow, double robust_nuhigh, /* robust nu range */
+  double *info, /* info[0]: initial residual, info[1]: final residual */
+  me_data_t *adata) { /* pointer to additional data */
+  /* reshape x to make J: 2Nx2 complex double
+  */
+  complex double *x;
+  if ((x=(complex double*)malloc((size_t)4*N*sizeof(complex double)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+  }
+  /* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
+   to
+  J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
+ */
+  double *Jd=(double*)x;
+  /* re J(0,0) */
+  my_dcopy(N, &x0[0], 8, &Jd[0], 4);
+  /* im J(0,0) */
+  my_dcopy(N, &x0[1], 8, &Jd[1], 4);
+  /* re J(1,0) */
+  my_dcopy(N, &x0[4], 8, &Jd[2], 4);
+  /* im J(1,0) */
+  my_dcopy(N, &x0[5], 8, &Jd[3], 4);
+  /* re J(0,1) */
+  my_dcopy(N, &x0[2], 8, &Jd[4*N], 4);
+  /* im J(0,1) */
+  my_dcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
+  /* re J(1,1) */
+  my_dcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
+  /* im J(1,1) */
+  my_dcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
+
+  int Nt=adata->Nt;
+  int ci;
+  global_data_rtr_t gdata;
+
+  gdata.medata=adata;
+  /* setup threads */
+  pthread_attr_init(&gdata.attr);
+  pthread_attr_setdetachstate(&gdata.attr,PTHREAD_CREATE_JOINABLE);
+
+  if ((gdata.th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+  }
+  if ((gdata.mx_array=(pthread_mutex_t*)malloc((size_t)N*sizeof(pthread_mutex_t)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+  }
+  if ((gdata.iw=(double*)malloc((size_t)N*sizeof(double)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+  }
+  /* weights for robust LS, length could be less than total no of baselines
+    therefore use relative offset boff */
+  if ((gdata.wtd=(double*)malloc((size_t)M*sizeof(double)))==0) {
+#ifndef USE_MIC
+      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+#endif
+      exit(1);
+  }
+
+  for (ci=0; ci<N; ci++) {
+   pthread_mutex_init(&gdata.mx_array[ci],NULL);
+  }
+ /* count baseline->station contributions 
+   NOTE: has to be done here because the baseline offset would change */
+ fns_fcount(&gdata);
+/***************************************************/
+ complex double *fgradx,*eta,*z,*x_prop,*z_prop;
+ if ((fgradx=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
+#ifndef USE_MIC
+      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+#endif
+      exit(1);
+ }
+ if ((eta=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
+#ifndef USE_MIC
+      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+#endif
+      exit(1);
+ }
+ if ((z=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
+#ifndef USE_MIC
+      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+#endif
+      exit(1);
+ }
+ if ((x_prop=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
+#ifndef USE_MIC
+      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+#endif
+      exit(1);
+ }
+ if ((z_prop=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
+#ifndef USE_MIC
+      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+#endif
+      exit(1);
+ }
+
+ /*set initial weights to 1 */
+ setweights(M,gdata.wtd,1.0,Nt);
+ gdata.nulow=robust_nulow;
+ gdata.nuhigh=robust_nuhigh;
+ /* cost at the initial solution, reported as the initial residual */
+ double fx;
+ fx=fns_f(x,y,&gdata);
+ double fx0=fx;
+ info[0]=fx0;
+/***************************************************/
+  /* gradient at x0 */
+  fns_fgrad(x,fgradx,y,&gdata,1);
+  /* Hessian at x0,x0 */
+  fns_fhess(x,x,z,y,&gdata);
+  /* initial step ~ 1/||Hessian|| */
+  double hess_nrm=my_cnrm2(4*N,z);
+  double t=1.0/hess_nrm;
+  /* if initial step too small */
+  if (t<1e-6) {
+   t=1e-6;
+  }
+
+  /* z <= x */
+  my_ccopy(4*N,x,1,z,1);
+  double theta=1.0;
+  double ALPHA=1.01; /*  step-size growth factor */
+  double BETA=0.5; /* step-size shrinkage factor */
+
+  int k;
+  for (k=0; k<itermax; k++) {
+   /* x_prop <= x */
+   my_ccopy(4*N,x,1,x_prop,1);
+   /* z_prop <= z */
+   my_ccopy(4*N,z,1,z_prop,1);
+
+   /* x <= z - t * grad */
+   my_ccopy(4*N,z,1,x,1);
+   my_caxpy(4*N, fgradx, -t+0.0*_Complex_I, x);
+
+   /* if ||x-z|| == t||grad|| is below threshold, stop iteration */
+   double grad_nrm=my_cnrm2(4*N,fgradx);
+   double x_nrm=my_cnrm2(4*N,x);
+   /* norm(y-x)/max(1,norm(x)); */
+   if (grad_nrm*t/MAX(1.0,x_nrm) < 1e-6) {
+      break;
+   }
+
+   /* theta = 2/(1 + sqrt(1+4/(theta^2))); */
+   theta=2.0/(1.0 + sqrt(1.0+4.0/(theta*theta)));
+
+   /* z = x + (1-theta)*(x-x_prop); 
+       z = (2-theta)*x  - (1-theta) * x_prop */
+   my_ccopy(4*N,x,1,z,1);
+   my_cscal(4*N, 2.0-theta+0.0*_Complex_I, z);
+   my_caxpy(4*N, x_prop, -(1.0-theta)+0.0*_Complex_I, z);
+
+   /* eta = grad_old;
+     grad  <= grad_f( z ) */
+   my_ccopy(4*N,fgradx,1,eta,1);
+   fns_fgrad(z,fgradx,y,&gdata,1);
+
+   /* z_prop <= z_prop - z */
+   my_caxpy(4*N, z, -1.0+0.0*_Complex_I, z_prop);
+   /* eta <= eta - new_grad */
+   my_caxpy(4*N, fgradx, -1.0+0.0*_Complex_I, eta);
+
+   /* ||z-z_prop|| */
+   double ydiffnrm=my_cnrm2(4*N,z_prop);
+   /* (z-z_prop)'*(grad-grad_old) */
+   double dot_ydiff_gdiff=my_ddot(8*N, (double *)z_prop, (double *)eta);
+#ifdef DEBUG
+   printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
+#endif
+   /* the above can be NAN, if so break loop */
+   if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) { 
+     break;
+   }
+
+   /* backtracking
+     t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
+     t = min( ALPHA*t, max( BETA*t, t_hat ));
+   */
+   double t_hat=0.5*(ydiffnrm*ydiffnrm)/fabs(dot_ydiff_gdiff);
+   t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
+   
+#ifdef DEBUG
+printf("k=%d theta=%e step=%e\n",k,theta,t);
+#endif
+  }
+
+   fx=fns_f(x,y,&gdata);
+   /* final residual */
+   info[1]=fx;
+
+#ifdef DEBUG
+printf("k=%d cost initial=%e final=%e\n",k,fx0,fx);
+#endif
+
+   free(fgradx);
+   free(eta);
+   free(z);
+   free(z_prop);
+   free(x_prop);
+/***************************************************/
+  double robust_nu1=fns_fupdate_weights(x,y,&gdata);
+  adata->robust_nu=robust_nu1;
+  if (fx0>fx) { /* accept the new solution only if the cost decreased */
+  /* copy back solution to x0 */
+  /* re J(0,0) */
+  my_dcopy(N, &Jd[0], 4, &x0[0], 8);
+  /* im J(0,0) */
+  my_dcopy(N, &Jd[1], 4, &x0[1], 8);
+  /* re J(1,0) */
+  my_dcopy(N, &Jd[2], 4, &x0[4], 8);
+  /* im J(1,0) */
+  my_dcopy(N, &Jd[3], 4, &x0[5], 8);
+  /* re J(0,1) */
+  my_dcopy(N, &Jd[4*N], 4, &x0[2], 8);
+  /* im J(0,1) */
+  my_dcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
+  /* re J(1,1) */
+  my_dcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
+  /* im J(1,1) */
+  my_dcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
+  }
+
+  for (ci=0; ci<N; ci++) {
+   pthread_mutex_destroy(&gdata.mx_array[ci]);
+  }
+  pthread_attr_destroy(&gdata.attr);
+  free(gdata.th_array);
+  free(gdata.mx_array);
+  free(gdata.iw);
+  free(gdata.wtd);
+  free(x);
+  return 0;
+}
--- a/src/lib/rtr_solve_robust_admm.c
+++ b/src/lib/rtr_solve_robust_admm.c
@ -331,7 +331,8 @@ fns_f(complex double *x, double *y,  global_data_rtr_t *gdata) {
 }


-/* worker thread function for weight update (nu+8)/(nu+error^2) */
+/* worker thread function for weight update (nu+p)/(nu+error^2) */
+/* p=2, not p=8 because using MAX() not sum for error^2 */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
@ -383,7 +384,8 @@ threadfn_fns_fupdate_weights(void *data) {
     double r11=t->y[8*ci+6]-creal(T2[3]);
     double i11=t->y[8*ci+7]-cimag(T2[3]);

-     t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
+     //t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
+     t->wtd[ci] = (t->nu0+2.0)/(t->nu0+MAX(r00*r00+i00*i00,MAX(r01*r01+i01*i01,MAX(r10*r10+i10*i10,r11*r11+i11*i11))));
   }
 }

@ -495,11 +497,14 @@ fns_fupdate_weights(complex double *x, double *y,  global_data_rtr_t *gdata) {
   pthread_join(gdata->th_array[nth1],NULL);
  }
  sumlogw/=(double)Nbase1;
- free(threaddata);
+  free(threaddata);

- /* find new value for nu, p-variate T dist, p=8 */
+ /* find new value for nu, p-variate T dist, p=8 (update p=2 because using MAX()  for residual calculation, not sum) */
 /*  psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0, AECM */
- double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 8, dp->robust_nu);
+ double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 2, dp->robust_nu);
+ /* make sure new value stays within bounds */
+ if (nu1<gdata->nulow) { return gdata->nulow; }
+ if (nu1>gdata->nuhigh) { return gdata->nuhigh; }

 return nu1;
 }
@ -1385,6 +1390,145 @@ armijostep(int N,complex double *x,complex double *teta, double *y, global_data_
 return nocostred;
 }

+/* Fine tune initial trust region radius, also update initial value for x
+   A. Sartenaer, 1995 (MK trial steps are taken along the normalized gradient)
+   returns : trust region estimate (largest radius with |rho-1|<mu_0, else the last updated radius),
+   also modifies x (moved by the trial step giving the largest cost reduction, if any)
+   eta,Heta,s,x_prop: used as storage
+ */
+#ifdef USE_MIC
+__attribute__ ((target(MIC)))
+#endif
+static double
+itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *y, global_data_rtr_t *gdata, complex double *s, complex double *x_prop) {
+
+ double f0,fk,mk,rho,rho1,Delta0;
+
+ /* initialize trust region radii */
+ double delta_0=1.0; /* current trial radius */
+ double delta_m=0.0; /* largest trial radius for which |rho-1|<mu_0 */
+
+ double sigma=0.0; /* trial radius that produced the best cost reduction */
+ double delta=0.0; /* best cost reduction (f0-fk) seen so far */
+
+ // initial cost
+ f0=fns_f(x,y,gdata);
+ // gradient at x0
+ fns_fgrad(x,eta,y,gdata,1);
+ //normalize
+ double eta_nrm=my_cnrm2(4*N,eta);
+ my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta);
+
+ /* s = delta_0 * eta : first trial step */
+ my_ccopy(4*N,eta,1,s,1);
+ my_cscal(4*N, delta_0+0.0*_Complex_I, s);
+ //Hessian at s
+ fns_fhess(x,s,Heta,y,gdata);
+
+ /* constants used */
+ double gamma_1=0.0625; double gamma_2=5.0; double gamma_3=0.5; double gamma_4=2.0;
+ double mu_0=0.5; double mu_1=0.5; double mu_2=0.35;
+ double teta=0.25;
+
+ /* MK: number of trial steps */
+ int m,MK=4;
+ for (m=0; m<MK; m++) {
+   /* x_prop=x0-s */
+   my_ccopy(4*N,x,1,x_prop,1);
+   my_caxpy(4*N, s, -1.0+0.0*_Complex_I, x_prop);
+
+   /* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
+   mk=f0-fns_g(N,x_prop,eta,s)-0.5*fns_g(N,x_prop,Heta,s);
+   fk=fns_f(x_prop,y,gdata);
+
+   /* rho: actual reduction / predicted reduction; guard against division by zero */
+   if (f0==mk) {
+    rho=1e9;
+   } else {
+    rho=(f0-fk)/(f0-mk);
+   }
+   rho1=fabs(rho-1.0);
+
+   /* update max radius */
+   if (rho1<mu_0) {
+     delta_m=MAX(delta_m,delta_0);
+   }
+   /* remember the trial radius with the largest cost reduction */
+   if ((f0-fk)>delta) {
+     delta=f0-fk;
+     sigma=delta_0;
+   }
+   /* radius update */
+   double beta_1,beta_2,beta_i;
+   beta_1=0.0;
+   beta_2=0.0;
+   if (m<MK) { /* always true inside this loop */
+     double g0_s=fns_g(N,x,eta,s);
+     double b1=(teta*(f0-g0_s)+(1.0-teta)*mk-fk);
+     beta_1=(b1==0.0?1e9:-teta*g0_s/b1);
+
+     double b2=(-teta*(f0-g0_s)+(1.0+teta)*mk-fk);
+     beta_2=(b2==0.0?1e9:teta*g0_s/b2);
+
+     double minbeta=MIN(beta_1,beta_2);
+     double maxbeta=MAX(beta_1,beta_2);
+     /* choose the divisor beta_i for the radius from three cases on rho1=|rho-1| */
+     if (rho1>mu_1) {
+       if (minbeta>1.0) {
+        beta_i=gamma_3;
+       } else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0)) {
+        beta_i=gamma_1;
+       } else if ((beta_1>=gamma_1 && beta_1<1.0) && (beta_2<gamma_1 || beta_2>=1.0)) {
+        beta_i=beta_1;
+       } else if ((beta_2>=gamma_1 && beta_2<1.0) && (beta_1<gamma_1 || beta_1>=1.0)) {
+        beta_i=beta_2;
+      } else {
+        beta_i=maxbeta;
+      }
+     } else if (rho1<=mu_2) {
+       if (maxbeta<1.0) {
+         beta_i=gamma_4;
+       } else if (maxbeta>gamma_2) {
+         beta_i=gamma_2;
+       } else if ((beta_1>=1.0 && beta_1<=gamma_2) && beta_2<1.0) {
+         beta_i=beta_1;
+       } else if ((beta_2>=1.0 && beta_2<=gamma_2) && beta_1<1.0) {
+         beta_i=beta_2;
+       } else {
+         beta_i=maxbeta;
+       }
+     } else {
+       if (maxbeta<gamma_3) {
+         beta_i=gamma_3;
+       } else if (maxbeta>gamma_4) {
+         beta_i=gamma_4;
+       } else {
+         beta_i=maxbeta;
+       }
+     }
+     /* update radius */
+     delta_0=delta_0/beta_i;
+   }
+
+#ifdef DEBUG
+printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
+#endif
+
+   /* next trial step: s = (updated delta_0) * eta */
+   my_ccopy(4*N,eta,1,s,1);
+   my_cscal(4*N,delta_0+0.0*_Complex_I, s);
+ }
+
+
+ // update initial value
+ if (delta>0.0) {
+   my_caxpy(4*N, eta, -sigma+0.0*_Complex_I, x);
+ }
+
+ /* prefer the largest radius where model and cost agreed, else the last updated radius */
+ if (delta_m>0.0) {
+  Delta0=delta_m;
+ } else {
+  Delta0=delta_0;
+ }
+
+ return Delta0;
+}
+


 int
@ -1583,7 +1727,8 @@ rtr_solve_nocuda_robust_admm(
 int rsdstat=0;
 /***************************************************/
 /* RSD solution */
- for (ci=0; ci<itmax_rsd; ci++) {
+ //for (ci=0; ci<itmax_rsd; ci++) {
+ for (ci=0; ci<0; ci++) {
  /* Armijo step */
  /* teta=armijostep(V,C,N,x); */
  rsdstat=armijostep(N,x,eta,y,&gdata,fgradx,x_prop,&fx); /* NOTE last two are just storage */
@ -1596,7 +1741,16 @@ rtr_solve_nocuda_robust_admm(
  }
 }

- Delta_bar=MIN(fx,Delta_bar); 
+ double Delta_new=itrr(N,x,eta,Heta, y, &gdata, fgradx, x_prop);
+#ifdef DEBUG
+ printf("TR radius given=%lf est=%lf\n",Delta0,Delta_new);
+#endif
+
+ //old values
+ //Delta_bar=MIN(fx,Delta_bar); 
+ Delta0=MIN(Delta_new,0.01); /* need to be more restrictive for EM */
+ Delta_bar=Delta0*8.0;
+
 rho_regularization=fx*1e-6;
 //printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);

--- a/src/lib/rtr_solve_robust_cuda.c
+++ b/src/lib/rtr_solve_robust_cuda.c
@ -116,7 +116,7 @@ cudakernel_fns_fgrad_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int
   to tangent space before it is averaged 
 so calculate grad using N(N-1)/2 constraints each (total M)
 */
-/* need 8N*BlocksPerGrid+ 8N*2 float storage */
+/* need 8N*M/ThreadsPerBlock+ 8N float storage */
 static void
 cudakernel_fns_fgrad_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, float *y, float *coh, char *bbh, float *iw, float *wtd, int negate, cublasHandle_t cbhandle,float *gWORK) {
 /* baselines per timeslot = N(N-1)/2 ~2400, timeslots = M/baselines ~120
@ -213,11 +213,10 @@ printf("N=%d Baselines=%d timeslots=%d total=%d,Threads=%d Blocks=%d\n",N,nbase,
 } 
 cudaMemcpy(eta,tempeta,4*N*sizeof(cuFloatComplex),cudaMemcpyDeviceToDevice);
 cudaFree(Bd);
- //cudakernel_fns_proj(N, x, tempeta, eta, cbhandle);
 }

 /* Hessian, also projected to tangent space */
-/* need 8N*BlocksPerGrid+ 8N*2 float storage */
+/* need 8N*M/ThreadsPerBlock+ 8N*2 float storage */
 static void
 cudakernel_fns_fhess_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
 cuFloatComplex *tempeta,*tempb;
@ -265,7 +264,7 @@ cudakernel_fns_fhess_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int
   to tangent space before it is averaged 
 so calculate grad using N(N-1)/2 constraints each (total M)
 */
-/* need 8N*BlocksPerGrid+ 8N*2 float storage */
+/* need 8N*M/ThreadsPerBlock+ 8N float storage */
 static void
 cudakernel_fns_fhess_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
 cuFloatComplex *tempeta;
@ -350,7 +349,6 @@ printf("N=%d Baselines=%d timeslots=%d total=%d,Threads=%d Blocks=%d\n",N,nbase,
 cudaMemcpy(fhess,tempeta,4*N*sizeof(cuFloatComplex),cudaMemcpyDeviceToDevice);
 cudaFree(Bd);

- //cudakernel_fns_proj(N, x, tempeta, fhess, cbhandle);
 }


@ -449,6 +447,160 @@ printf("m=%d lhs=%e rhs=%e rat=%e norm=%e\n",m,lhs,rhs,lhs/rhs,metric);
 }


+/* Fine tune initial trust region radius, also update initial value for x
+   A. Sartenaer, 1995
+
+   x        : current solution (4N complex); if any trial step reduced the
+              cost, it is overwritten with x - sigma*eta, where sigma is the
+              trial radius that gave the largest reduction
+   eta,Heta : used as storage (4N complex each); eta receives the output of
+              cudakernel_fns_fgrad_robust() (called with negate=1) scaled to
+              unit norm, Heta the output of cudakernel_fns_fhess_robust()
+   y,coh,bbh,iw,wtd : passed through unchanged to the cost/gradient/Hessian
+              routines
+   gWORK    : device workspace; the first 8N floats hold the trial step s,
+              the next 8N floats the trial point x_prop, and the remainder
+              is handed to the cost/gradient/Hessian routines
+
+   returns : trust region estimate: the largest trial radius for which
+             |rho-1| < mu_0, or the last updated trial radius if there was
+             none. If the gradient norm is zero or not finite, the starting
+             radius (1.0) is returned and x is left unchanged.
+ */
+/* need 8N*2 + MAX(2 Blocks + 4, 8N (1 + ceil(M/Threads))) float storage */
+static float
+itrr(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta,  cuFloatComplex *Heta, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
+ cuFloatComplex alpha;
+ cublasStatus_t cbstatus;
+ /* temp storage, re-using global storage */ 
+ cuFloatComplex *s, *x_prop;
+ unsigned long int moff=0;
+ s=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ x_prop=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ float *gWORK1=&gWORK[moff];
+
+
+ float f0,fk,mk,rho,rho1;
+ /* initialize trust region radii */
+ float delta_0=1.0f; /* current trial radius */
+ float delta_m=0.0f; /* largest trial radius with |rho-1| < mu_0 */
+
+ float sigma=0.0f; /* trial radius that gave the largest cost reduction */
+ float delta=0.0f; /* largest cost reduction seen so far */
+
+ // initial cost
+ f0=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,y,coh,bbh,wtd,gWORK1);
+ // gradient at x0;
+ cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,eta,y,coh,bbh,iw,wtd,1,cbhandle, gWORK1);
+ // normalize
+ float eta_nrm;
+ cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
+ float inv_nrm=1.0f/eta_nrm;
+ /* a zero, NaN or infinite norm cannot be normalized: scaling by 1/norm
+    would fill eta and s with NaN/Inf and the returned radius would be NaN.
+    Keep x as it is and hand back the starting radius instead. */
+ if (!(eta_nrm>0.0f) || !(inv_nrm>0.0f)) {
+  return delta_0;
+ }
+ alpha.x=inv_nrm;alpha.y=0.0f;
+ cbstatus=cublasCscal(cbhandle,4*N,&alpha,eta,1);
+
+ /* s = delta_0 * eta */
+ cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
+ alpha.x=delta_0;alpha.y=0.0f;
+ cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
+ /* Hessian at s */
+ /* NOTE(review): Heta is evaluated only here (for delta_0=1) and reused in
+    the loop below while s is rescaled every iteration - confirm that this
+    is the intended model */
+ cudakernel_fns_fhess_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,s,Heta,y,coh,bbh,iw,wtd,cbhandle, gWORK1);
+
+ /* constants used */
+ float gamma_1=0.0625f; float gamma_2=5.0f; float gamma_3=0.5f; float gamma_4=2.0f;
+ float mu_0=0.5f; float mu_1=0.5f; float mu_2=0.35f;
+ float teta=0.25f;
+
+
+ int MK=4; /* number of trial radii evaluated */
+ int m;
+ for (m=0; m<MK; m++) {
+   /* x_prop=x0-s */
+   cbstatus=cublasCcopy(cbhandle,4*N,x,1,x_prop,1);
+   alpha.x=-1.0f;alpha.y=0.0f;
+   cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, s, 1, x_prop, 1);
+
+   /* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
+   mk=f0-cudakernel_fns_g(N,x_prop,eta,s,cbhandle)-0.5f*cudakernel_fns_g(N,x_prop,Heta,s,cbhandle);
+   fk=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,y,coh,bbh,wtd,gWORK1);
+
+   /* ratio of actual to predicted reduction, guarding the 0/0 case */
+   if (f0==mk) {
+    rho=1e9f;
+   } else {
+    rho=(f0-fk)/(f0-mk);
+   }
+   rho1=fabsf(rho-1.0f);
+   
+   /* update max radius */
+   if (rho1<mu_0) {
+     delta_m=MAX(delta_m,delta_0);
+   }
+   /* remember the radius that gave the best cost reduction */
+   if ((f0-fk)>delta) {
+     delta=f0-fk;
+     sigma=delta_0;
+   }
+
+   /* radius update */
+   float g0_s=cudakernel_fns_g(N,x,eta,s,cbhandle);
+   float b1=(teta*(f0-g0_s)+(1.0f-teta)*mk-fk);
+   float beta_1=(b1==0.0f?1e9f:-teta*g0_s/b1); 
+     
+   float b2=(-teta*(f0-g0_s)+(1.0f+teta)*mk-fk);
+   float beta_2=(b2==0.0f?1e9f:teta*g0_s/b2); 
+    
+   float minbeta=MIN(beta_1,beta_2);
+   float maxbeta=MAX(beta_1,beta_2);
+   float beta_i;
+   if (rho1>mu_1) {
+     /* poor agreement between model and cost */
+     if (minbeta>1.0f) {
+      beta_i=gamma_3;
+     } else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0f)) {
+      beta_i=gamma_1;
+     } else if ((beta_1>=gamma_1 && beta_1<1.0f) && (beta_2<gamma_1 || beta_2>=1.0f)) {
+      beta_i=beta_1;
+     } else if ((beta_2>=gamma_1 && beta_2<1.0f) && (beta_1<gamma_1 || beta_1>=1.0f)) {
+      beta_i=beta_2;
+     } else {
+      beta_i=maxbeta;
+     }
+   } else if (rho1<=mu_2) {
+     /* good agreement between model and cost */
+     if (maxbeta<1.0f) {
+       beta_i=gamma_4;
+     } else if (maxbeta>gamma_2) {
+       beta_i=gamma_2;
+     } else if ((beta_1>=1.0f && beta_1<=gamma_2) && beta_2<1.0f) {
+       beta_i=beta_1;
+     } else if ((beta_2>=1.0f && beta_2<=gamma_2) && beta_1<1.0f) {
+       beta_i=beta_2;
+     } else {
+       beta_i=maxbeta;
+     }
+   } else {
+     /* in between: clamp maxbeta to [gamma_3,gamma_4] */
+     if (maxbeta<gamma_3) {
+       beta_i=gamma_3;
+     } else if (maxbeta>gamma_4) {
+       beta_i=gamma_4;
+     } else {
+       beta_i=maxbeta;
+     }
+   }
+   /* update radius */
+   delta_0=delta_0/beta_i;
+#ifdef DEBUG
+printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
+#endif
+
+   /* s = delta_0 * eta for the next trial */
+   cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
+   alpha.x=delta_0;alpha.y=0.0f;
+   cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
+ }
+
+ // update initial value
+ if (delta>0.0f) {
+  alpha.x=-sigma; alpha.y=0.0f;
+  cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, eta, 1, x, 1);
+ }
+
+ /* prefer the largest radius with good model agreement, if there was one */
+ return (delta_m>0.0f?delta_m:delta_0);
+}
+
+
 /* truncated conjugate gradient method 
  x, grad, eta, r, z, delta, Hxd  : size 2N x 2  complex 
  so, vector size is 4N complex double
@ -572,6 +724,12 @@ tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComp
 }


+
+/* storage:
+  8N * 5 + N + 8M * 2 + 2M + M (base storage)
+  MAX( 2 * Blocks + 4, 8N(6 + ceil(M/Threads)))  for functions
+  Blocks = ceil(M/Threads)
+*/
 int
 rtr_solve_cuda_robust_fl(
  float *x0,         /* initial values and updated solution at output (size 8*N float) */
@ -745,28 +903,36 @@ rtr_solve_cuda_robust_fl(
 printf("Initial Cost=%g\n",fx0);
 #endif
 /***************************************************/
- int rsdstat=0;
- /* RSD solution */
- for (ci=0; ci<itmax_sd; ci++) {
-  /* Armijo step */
-  /* teta=armijostep(V,C,N,x); */
-  rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
-  /* x=R(x,teta); */
-  cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
-  //my_ccopy(4*N,x_propd,1,xd,1);
-  if (!rsdstat) {
-   /* cost reduced, update solution */
-   cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
-  } else {
-   /* no cost reduction, break loop */
-   break; 
-  }
- }
+// int rsdstat=0;
+// /* RSD solution - disabled */
+// for (ci=0; ci<itmax_sd; ci++) {
+//  /* Armijo step */
+//  /* teta=armijostep(V,C,N,x); */
+//  rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
+//  /* x=R(x,teta); */
+//  cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
+//  if (!rsdstat) {
+//   /* cost reduced, update solution */
+//   cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
+//  } else {
+//   /* no cost reduction, break loop */
+//   break; 
+//  }
+// }
+
+ float Delta_new=itrr(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, Hetad, yd, cohd, bbd, iwd, wtd, cbhandle, gWORK1);
+
+#ifdef DEBUG
+ printf("TR radius given=%f est=%f\n",Delta0,Delta_new);
+#endif
+ 
+ //old values
+ //Delta_bar=MIN(fx,0.01f);
+ //Delta0=Delta_bar*0.125f;
+ Delta0=MIN(Delta_new,0.01f); /* need to be more restrictive for EM */
+ Delta_bar=Delta0*8.0f;

 cudakernel_fns_fupdate_weights(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,robust_nu);
-
- Delta_bar=MIN(fx,0.01f);
- Delta0=Delta_bar*0.125f;
 //printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);

 #ifdef DEBUG
@ -918,7 +1084,8 @@ printf("NEW RTR cost=%g\n",fx);

 /***************************************************/
 cudaDeviceSynchronize();
-   /* w <= (8+nu)/(1+error^2), q<=w-log(w) */
+   /* w <= (p+nu)/(1+error^2), q<=w-log(w) */
+   /* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
   cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
   /* sumq<=sum(w-log(w))/N */
   cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
@ -938,7 +1105,349 @@ printf("NEW RTR cost=%g\n",fx);
 #ifdef DEBUG
   printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
 #endif
-   dp->robust_nu=(double)robust_nu1;
+   /* seems pedantic, but make sure new value for robust_nu fits within bounds */
+   if (robust_nu1<robust_nulow) {
+    dp->robust_nu=robust_nulow;
+   } else if (robust_nu1>robust_nuhigh) {
+    dp->robust_nu=robust_nuhigh;
+   } else {
+    dp->robust_nu=(double)robust_nu1;
+   }
+  
+  if(fx0>fx) {
+  //printf("Cost final %g  initial %g\n",fx,fx0);
+  /* copy back current solution */
+  err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
+  checkCudaError(err,__FILE__,__LINE__);
+
+
+  /* copy back solution to x0 : format checked*/
+  /* re J(0,0) */
+  my_fcopy(N, &Jd[0], 4, &x0[0], 8);
+  /* im J(0,0) */
+  my_fcopy(N, &Jd[1], 4, &x0[1], 8);
+  /* re J(1,0) */
+  my_fcopy(N, &Jd[2], 4, &x0[4], 8);
+  /* im J(1,0) */
+  my_fcopy(N, &Jd[3], 4, &x0[5], 8);
+  /* re J(0,1) */
+  my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
+  /* im J(0,1) */
+  my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
+  /* re J(1,1) */
+  my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
+  /* im J(1,1) */
+  my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
+
+  }
+  free(x);
+
+  return 0;
+}
+
+
+
+/* storage:
+  8N * 6 + N + 8M * 2 + 2M + M (base storage)
+  MAX( 2 * Blocks + 4, 8N(1 + ceil(M/Threads)))  for functions
+  Blocks = ceil(M/Threads)
+*/
+int
+nsd_solve_cuda_robust_fl(
+  float *x0,         /* initial values and updated solution at output (size 8*N float) */
+  float *y,         /* data vector (size 8*M float) */
+  int N,              /* no of stations */
+  int M,              /* no of constraints */
+  int itmax,          /* maximum number of iterations */
+  double robust_nulow, double robust_nuhigh, /* robust nu range */
+  double *info, /* initial and final residuals */
+  cublasHandle_t cbhandle, /* device handle */
+  float *gWORK, /* GPU allocated memory */
+  int tileoff, /* tile offset when solving for many chunks */
+  int ntiles, /* total tile (data) size being solved for */
+  me_data_t *adata)
+{
+
+  /* general note: all device variables end with a 'd' */
+  cudaError_t err;
+  cublasStatus_t cbstatus;
+
+  /* ME data */
+  me_data_t *dp=(me_data_t*)adata;
+  int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
+  /* coherency on device */
+  float *cohd;
+  /* baseline-station map on device/host */
+  char *bbd;
+
+  /* calculate no of cuda threads and blocks */
+  int ThreadsPerBlock=128;
+  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+
+
+  /* reshape x to make J: 2Nx2 complex double 
+  */
+  complex float *x;
+  if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+  }
+  /* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
+   to
+  J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
+ */
+  float *Jd=(float*)x;
+  /* re J(0,0) */
+  my_fcopy(N, &x0[0], 8, &Jd[0], 4);
+  /* im J(0,0) */
+  my_fcopy(N, &x0[1], 8, &Jd[1], 4);
+  /* re J(1,0) */
+  my_fcopy(N, &x0[4], 8, &Jd[2], 4);
+  /* im J(1,0) */
+  my_fcopy(N, &x0[5], 8, &Jd[3], 4);
+  /* re J(0,1) */
+  my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
+  /* im J(0,1) */
+  my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
+  /* re J(1,1) */
+  my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
+  /* im J(1,1) */
+  my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
+
+
+  int ci;
+
+/***************************************************/
+ cuFloatComplex *xd,*fgradxd,*etad,*zd,*x_propd,*z_propd;
+ float *yd;
+ float *wtd,*qd; /* for robust weight and log(weight) */
+ float robust_nu=(float)dp->robust_nu;
+ float q_sum,robust_nu1;
+ float deltanu;
+ int Nd=100; /* no of points where nu is sampled, note Nd<N */
+ if (Nd>M) { Nd=M; }
+ deltanu=(float)(robust_nuhigh-robust_nulow)/(float)Nd;
+
+ /* for counting how many baselines contribute to each station
+   grad/hess calculation */
+ float *iwd,*iw;
+ if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+ }
+
+
+ unsigned long int moff=0;
+ fgradxd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N; /* 4N complex means 8N float */
+ etad=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ zd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ x_propd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ xd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ z_propd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+
+ yd=&gWORK[moff];
+ moff+=8*M;
+ cohd=&gWORK[moff];
+ moff+=Nbase*8;
+ bbd=(char*)&gWORK[moff];
+ unsigned long int charstor=(Nbase*2*sizeof(char))/sizeof(float);
+ if (!charstor || charstor%4) {
+  moff+=(charstor/4+1)*4; /* NOTE +4 multiple to align memory */
+ } else {
+  moff+=charstor;
+ }
+ iwd=&gWORK[moff];
+ if (!(N%4)) {
+  moff+=N;
+ } else {
+  moff+=(N/4+1)*4;
+ }
+ wtd=&gWORK[moff];
+ if (!(M%4)) {
+  moff+=M;
+ } else {
+  moff+=(M/4+1)*4;
+ }
+ qd=&gWORK[moff];
+ if (!(M%4)) {
+  moff+=M;
+ } else {
+  moff+=(M/4+1)*4;
+ }
+
+
+ /* remaining memory */
+ float *gWORK1=&gWORK[moff];
+
+ /* yd <=y : V */
+ err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ /* need to give right offset for coherencies */
+ /* offset: cluster offset+time offset */
+ /* C */
+ err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ /* correct offset for baselines */
+ err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(char), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ /* xd <=x : solution */
+ err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+
+ float fx,fx0;
+
+ /* count how many baselines contribute to each station, store (inverse) in iwd */
+ count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
+ err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ free(iw);
+
+ /* set initial weights to 1 by a cuda kernel */
+ cudakernel_setweights_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, wtd, 1.0f);
+ fx=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,gWORK1);
+ fx0=fx;
+#ifdef DEBUG
+printf("Initial Cost=%g\n",fx0);
+#endif
+/***************************************************/
+  // gradient at x0;
+  cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
+  // Hessian 
+  cudakernel_fns_fhess_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,xd,zd,yd,cohd,bbd,iwd,wtd,cbhandle,gWORK1);
+  // initial step = 1/||Hess||
+  float hess_nrm;
+  cublasScnrm2(cbhandle,4*N,zd,1,&hess_nrm);
+  float t=1.0f/hess_nrm;
+  /* if initial step too small */
+  if (t<1e-6f) {
+   t=1e-6f;
+  }
+  
+  /* z <= x */
+  cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
+  float theta=1.0f;
+  float ALPHA = 1.01f; // step-size growth factor
+  float BETA = 0.5f; // step-size shrinkage factor
+  int k;
+  cuFloatComplex alpha;
+
+  for (k=0; k<itmax; k++) {
+    /* x_prop <= x */
+    cbstatus=cublasCcopy(cbhandle,4*N,xd,1,x_propd,1);
+    /* z_prop <= z */
+    cbstatus=cublasCcopy(cbhandle,4*N,zd,1,z_propd,1);
+    
+    /* x <= z - t * grad */
+    cbstatus=cublasCcopy(cbhandle,4*N,zd,1,xd,1);
+    alpha.x=-t;alpha.y=0.0f;
+    cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, xd, 1);
+
+    /* if ||x-z|| == t||grad|| is below threshold, stop iteration */
+    float grad_nrm,x_nrm;
+    cublasScnrm2(cbhandle,4*N,fgradxd,1,&grad_nrm);
+    cublasScnrm2(cbhandle,4*N,xd,1,&x_nrm);
+    /* norm(y-x)/max(1,norm(x)); */
+    if (grad_nrm*t/MAX(1.0f,x_nrm) < 1e-6f) {
+      break;
+    }
+
+
+    /* theta = 2/(1 + sqrt(1+4/(theta^2))); */
+    theta=2.0f/(1.0f + sqrtf(1.0f+4.0f/(theta*theta)));
+
+    /* z = x + (1-theta)*(x-x_prop); 
+       z = (2-theta)*x  - (1-theta) * x_prop */
+    cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
+    alpha.x=(2.0f-theta);alpha.y=0.0f;
+    cbstatus=cublasCscal(cbhandle,4*N,&alpha,zd,1);
+    alpha.x=-(1.0f-theta);alpha.y=0.0f;
+    cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, x_propd, 1, zd, 1);
+
+    /* eta = grad_old;
+     grad  <= grad_f( z ) */
+    cbstatus=cublasCcopy(cbhandle,4*N,fgradxd,1,etad,1);
+    cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,zd,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
+
+    /* z_prop <= z_prop - z */
+    alpha.x=-1.0f;alpha.y=0.0f;
+    cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, zd, 1, z_propd, 1);
+    /* eta <= eta - new_grad */
+    cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, etad, 1);
+   
+    /* ||z-z_prop|| */
+    float ydiffnrm;
+    cublasScnrm2(cbhandle,4*N,z_propd,1,&ydiffnrm);
+    /* (z_zold)'*(grad-grad_old) */
+    float dot_ydiff_gdiff;
+    cbstatus=cublasSdot(cbhandle, 8*N, (float*)z_propd, 1, (float*)etad, 1, &dot_ydiff_gdiff);
+#ifdef DEBUG
+   printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
+#endif
+    /* the above can be NAN, if so break loop */
+    if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) {
+     break;
+    }
+
+
+    /* backtracking
+     t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
+     t = min( ALPHA*t, max( BETA*t, t_hat ));
+    */
+    float t_hat=0.5f*(ydiffnrm*ydiffnrm)/fabsf(dot_ydiff_gdiff);
+    t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
+#ifdef DEBUG
+printf("k=%d theta=%e step=%e\n",k,theta,t);
+#endif
+  }
+
+  /* final residual */
+  fx=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,gWORK1);
+  info[1]=fx;
+#ifdef DEBUG
+printf("NEW NSD cost=%g\n",fx);
+#endif
+
+/***************************************************/
+ cudaDeviceSynchronize();
+   /* w <= (p+nu)/(1+error^2), q<=w-log(w) */
+   /* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
+   cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
+   /* sumq<=sum(w-log(w))/N */
+   cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
+   q_sum/=(float)M;
+#ifdef DEBUG
+   printf("deltanu=%f sum(w-log(w))=%f\n",deltanu,q_sum);
+#endif
+  /* for nu range 2~numax evaluate, p-variate T
+     psi((nu0+p)/2)-ln((nu0+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 
+     note: AECM not ECME
+     and find min(| |) */
+   int ThreadsPerBlock2=ThreadsPerBlock/4;
+   cudakernel_evaluatenu_fl_eight(ThreadsPerBlock2, (Nd+ThreadsPerBlock-1)/ThreadsPerBlock2, Nd, q_sum, qd, deltanu,(float)robust_nulow,robust_nu);
+   /* find min(abs()) value */
+   cbstatus=cublasIsamin(cbhandle, Nd, qd, 1, &ci); /* 1 based index */
+   robust_nu1=(float)robust_nulow+(float)(ci-1)*deltanu;
+#ifdef DEBUG
+   printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
+#endif
+   /* seems pedantic, but make sure new value for robust_nu fits within bounds */
+   if (robust_nu1<robust_nulow) {
+    dp->robust_nu=robust_nulow;
+   } else if (robust_nu1>robust_nuhigh) {
+    dp->robust_nu=robust_nuhigh;
+   } else {
+    dp->robust_nu=(double)robust_nu1;
+   }
  
  if(fx0>fx) {
  //printf("Cost final %g  initial %g\n",fx,fx0);
--- a/src/lib/rtr_solve_robust_cuda_admm.c
+++ b/src/lib/rtr_solve_robust_cuda_admm.c
@ -64,6 +64,7 @@ checkCublasError(cublasStatus_t cbstatus, char *file, int line)


 /* cost function */
+/* storage <= (2 Blocks+4) + 8N */
 static float
 cudakernel_fns_f_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, float *y, float *coh, char *bbh,  float *wtd, cublasHandle_t cbhandle, float *gWORK){
 cuFloatComplex *Yd;
@ -113,7 +114,7 @@ cudakernel_fns_proj_admm(int N, cuFloatComplex *x, cuFloatComplex *z, cuFloatCom


 /* gradient, also projected to tangent space */
-/* need 8N*BlocksPerGrid+ 8N*2 float storage */
+/* need 8N*M/ThreadsPerBlock+ 8N float storage */
 static void
 cudakernel_fns_fgrad_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta, float *y, float *coh, char *bbh, float *iw, float *wtd, int negate, cublasHandle_t cbhandle,float *gWORK) {

@ -182,7 +183,7 @@ cudakernel_fns_fgrad_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N,
 }

 /* Hessian, also projected to tangent space */
-/* need 8N*BlocksPerGrid+ 8N*2 float storage */
+/* need 8N*M/ThreadsPerBlock+ 8N float storage */
 static void
 cudakernel_fns_fhess_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x,  cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
 cuFloatComplex *tempeta;
@ -276,6 +277,161 @@ armijostep(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex
 }


+/* Fine tune initial trust region radius, also update initial value for x
+   A. Sartenaer, 1995
+
+   x        : current solution (4N complex); if any trial step reduced the
+              cost, it is overwritten with x - sigma*eta, where sigma is the
+              trial radius that gave the largest reduction
+   Y,Z,admm_rho : passed through unchanged to the ADMM cost/gradient/Hessian
+              routines
+   eta,Heta : used as storage (4N complex each); eta receives the output of
+              cudakernel_fns_fgrad_robust_admm() (called with negate=1)
+              scaled to unit norm, Heta the output of
+              cudakernel_fns_fhess_robust_admm()
+   y,coh,bbh,iw,wtd : passed through unchanged to the cost/gradient/Hessian
+              routines
+   gWORK    : device workspace; the first 8N floats hold the trial step s,
+              the next 8N floats the trial point x_prop, and the remainder
+              is handed to the cost/gradient/Hessian routines
+
+   returns : trust region estimate: the largest trial radius for which
+             |rho-1| < mu_0, or the last updated trial radius if there was
+             none. If the gradient norm is zero or not finite, the starting
+             radius (1.0) is returned and x is left unchanged.
+ */
+/* need 8N*2 + MAX(8N+2 Blocks + 4, 8N (1 + ceil(M/Threads))) float storage */
+static float
+itrr(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta,  cuFloatComplex *Heta, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
+ cuFloatComplex alpha;
+ cublasStatus_t cbstatus;
+ /* temp storage, re-using global storage */ 
+ cuFloatComplex *s, *x_prop;
+ unsigned long int moff=0;
+ s=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ x_prop=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ float *gWORK1=&gWORK[moff];
+
+
+ float f0,fk,mk,rho,rho1;
+ /* initialize trust region radii */
+ float delta_0=1.0f; /* current trial radius */
+ float delta_m=0.0f; /* largest trial radius with |rho-1| < mu_0 */
+
+ float sigma=0.0f; /* trial radius that gave the largest cost reduction */
+ float delta=0.0f; /* largest cost reduction seen so far */
+
+ // initial cost
+ f0=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,y,coh,bbh,wtd,cbhandle,gWORK1);
+ // gradient at x0;
+ cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,eta,y,coh,bbh,iw,wtd,1,cbhandle, gWORK1);
+ // normalize
+ float eta_nrm;
+ cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
+ float inv_nrm=1.0f/eta_nrm;
+ /* a zero, NaN or infinite norm cannot be normalized: scaling by 1/norm
+    would fill eta and s with NaN/Inf and the returned radius would be NaN.
+    Keep x as it is and hand back the starting radius instead. */
+ if (!(eta_nrm>0.0f) || !(inv_nrm>0.0f)) {
+  return delta_0;
+ }
+ alpha.x=inv_nrm;alpha.y=0.0f;
+ cbstatus=cublasCscal(cbhandle,4*N,&alpha,eta,1);
+
+ /* s = delta_0 * eta */
+ cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
+ alpha.x=delta_0;alpha.y=0.0f;
+ cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
+ /* Hessian at s */
+ /* NOTE(review): Heta is evaluated only here (for delta_0=1) and reused in
+    the loop below while s is rescaled every iteration - confirm that this
+    is the intended model */
+ cudakernel_fns_fhess_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,s,Heta,y,coh,bbh,iw,wtd,cbhandle,gWORK1);
+
+ /* constants used */
+ float gamma_1=0.0625f; float gamma_2=5.0f; float gamma_3=0.5f; float gamma_4=2.0f;
+ float mu_0=0.5f; float mu_1=0.5f; float mu_2=0.35f;
+ float teta=0.25f;
+
+
+ int MK=4; /* number of trial radii evaluated */
+ int m;
+ for (m=0; m<MK; m++) {
+   /* x_prop=x0-s */
+   cbstatus=cublasCcopy(cbhandle,4*N,x,1,x_prop,1);
+   alpha.x=-1.0f;alpha.y=0.0f;
+   cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, s, 1, x_prop, 1);
+
+   /* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
+   mk=f0-cudakernel_fns_g(N,x_prop,eta,s,cbhandle)-0.5f*cudakernel_fns_g(N,x_prop,Heta,s,cbhandle);
+   fk=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,Y,Z,admm_rho,y,coh,bbh,wtd,cbhandle,gWORK1);
+
+   /* ratio of actual to predicted reduction, guarding the 0/0 case */
+   if (f0==mk) {
+    rho=1e9f;
+   } else {
+    rho=(f0-fk)/(f0-mk);
+   }
+   rho1=fabsf(rho-1.0f);
+   
+   /* update max radius */
+   if (rho1<mu_0) {
+     delta_m=MAX(delta_m,delta_0);
+   }
+   /* remember the radius that gave the best cost reduction */
+   if ((f0-fk)>delta) {
+     delta=f0-fk;
+     sigma=delta_0;
+   }
+
+   /* radius update */
+   float g0_s=cudakernel_fns_g(N,x,eta,s,cbhandle);
+   float b1=(teta*(f0-g0_s)+(1.0f-teta)*mk-fk);
+   float beta_1=(b1==0.0f?1e9f:-teta*g0_s/b1); 
+     
+   float b2=(-teta*(f0-g0_s)+(1.0f+teta)*mk-fk);
+   float beta_2=(b2==0.0f?1e9f:teta*g0_s/b2); 
+    
+   float minbeta=MIN(beta_1,beta_2);
+   float maxbeta=MAX(beta_1,beta_2);
+   float beta_i;
+   if (rho1>mu_1) {
+     /* poor agreement between model and cost */
+     if (minbeta>1.0f) {
+      beta_i=gamma_3;
+     } else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0f)) {
+      beta_i=gamma_1;
+     } else if ((beta_1>=gamma_1 && beta_1<1.0f) && (beta_2<gamma_1 || beta_2>=1.0f)) {
+      beta_i=beta_1;
+     } else if ((beta_2>=gamma_1 && beta_2<1.0f) && (beta_1<gamma_1 || beta_1>=1.0f)) {
+      beta_i=beta_2;
+     } else {
+      beta_i=maxbeta;
+     }
+   } else if (rho1<=mu_2) {
+     /* good agreement between model and cost */
+     if (maxbeta<1.0f) {
+       beta_i=gamma_4;
+     } else if (maxbeta>gamma_2) {
+       beta_i=gamma_2;
+     } else if ((beta_1>=1.0f && beta_1<=gamma_2) && beta_2<1.0f) {
+       beta_i=beta_1;
+     } else if ((beta_2>=1.0f && beta_2<=gamma_2) && beta_1<1.0f) {
+       beta_i=beta_2;
+     } else {
+       beta_i=maxbeta;
+     }
+   } else {
+     /* in between: clamp maxbeta to [gamma_3,gamma_4] */
+     if (maxbeta<gamma_3) {
+       beta_i=gamma_3;
+     } else if (maxbeta>gamma_4) {
+       beta_i=gamma_4;
+     } else {
+       beta_i=maxbeta;
+     }
+   }
+   /* update radius */
+   delta_0=delta_0/beta_i;
+#ifdef DEBUG
+printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
+#endif
+
+   /* s = delta_0 * eta for the next trial */
+   cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
+   alpha.x=delta_0;alpha.y=0.0f;
+   cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
+ }
+
+ // update initial value
+ if (delta>0.0f) {
+  alpha.x=-sigma; alpha.y=0.0f;
+  cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, eta, 1, x, 1);
+ }
+
+ /* prefer the largest radius with good model agreement, if there was one */
+ return (delta_m>0.0f?delta_m:delta_0);
+}
+
+
+
 /* truncated conjugate gradient method 
  x, grad, eta, r, z, delta, Hxd  : size 2N x 2  complex 
  so, vector size is 4N complex double
@ -399,6 +555,11 @@ tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComp
 }


+/* storage:
+  8N * 5 + N + 8M * 2 + 2M + M (base storage)
+  MAX( 8N+ 2 * Blocks + 4, 8N(6 + ceil(M/Threads)))  for functions
+  Blocks = ceil(M/Threads)
+*/
 int
 rtr_solve_cuda_robust_admm_fl(
  float *x0,         /* initial values and updated solution at output (size 8*N float) */
@ -407,8 +568,7 @@ rtr_solve_cuda_robust_admm_fl(
  float *y,         /* data vector (size 8*M float) */
  int N,              /* no of stations */
  int M,              /* no of constraints */
-  int itmax_sd,          /* maximum number of iterations RSD */
-  int itmax_rtr,          /* maximum number of iterations RTR */
+  int itmax_rtr,          /* maximum number of iterations */
  float Delta_bar, float Delta0, /* Trust region radius and initial value */
  float admm_rho, /* ADMM regularization */
  double robust_nulow, double robust_nuhigh, /* robust nu range */
@ -572,12 +732,6 @@ rtr_solve_cuda_robust_admm_fl(
 cudaMalloc((void **)&Yd, 4*N*sizeof(cuFloatComplex));
 cudaMalloc((void **)&Zd, 4*N*sizeof(cuFloatComplex));

-
- /* need 8N*(BlocksPerGrid+8) for tcg_solve+grad/hess storage,
-   so total storage needed is 
-   8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M + M
-   plus 8N + 8N for ADMM params (Y and BZ)
- */
 /* remaining memory */
 float *gWORK1=&gWORK[moff];

@ -616,28 +770,39 @@ rtr_solve_cuda_robust_admm_fl(
 printf("Initial Cost=%g\n",fx0);
 #endif
 /***************************************************/
- int rsdstat=0;
- /* RSD solution */
- for (ci=0; ci<itmax_sd; ci++) {
-  /* Armijo step */
-  rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
-  /* x=R(x,teta); */
-  cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
-  if (!rsdstat) {
-   /* cost reduced, update solution */
-   cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
-  } else {
-   /* no cost reduction, break loop */
-   break;
-  }
- }
+// int rsdstat=0;
+// /* RSD solution - disabled */
+// for (ci=0; ci<itmax_sd; ci++) {
+//  /* Armijo step */
+//  rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
+//  /* x=R(x,teta); */
+//  cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
+//  if (!rsdstat) {
+//   /* cost reduced, update solution */
+//   cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
+//  } else {
+//   /* no cost reduction, break loop */
+//   break;
+//  }
+// }
+
+ float Delta_new=itrr(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, Hetad, yd, cohd, bbd, iwd, wtd, cbhandle, gWORK1);
+#ifdef DEBUG
+ printf("TR radius given=%f est=%f\n",Delta0,Delta_new);
+#endif
+
+
+
+ //old values
+ //Delta_bar=MIN(fx,Delta_bar);
+ //Delta0=Delta_bar*0.125f;
+ Delta0=MIN(Delta_new,0.01f); /* need to be more restrictive for EM */
+ Delta_bar=Delta0*8.0f;
+
+//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);

 cudakernel_fns_fupdate_weights(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,robust_nu);

- Delta_bar=MIN(fx,Delta_bar);
- Delta0=Delta_bar*0.125f;
-//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
-
 #ifdef DEBUG
 printf("NEW RSD cost=%g\n",fx);
 #endif
@ -787,7 +952,8 @@ printf("NEW RTR cost=%g\n",fx);

 /***************************************************/
 cudaDeviceSynchronize();
-   /* w <= (8+nu)/(1+error^2), q<=w-log(w) */
+   /* w <= (p+nu)/(1+error^2), q<=w-log(w) */
+   /* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
   cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
   /* sumq<=sum(w-log(w))/N */
   cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
@ -807,7 +973,14 @@ printf("NEW RTR cost=%g\n",fx);
 #ifdef DEBUG
   printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
 #endif
-   dp->robust_nu=(double)robust_nu1;
+   /* seems pedantic, but make sure new value for robust_nu fits within bounds */
+   if (robust_nu1<robust_nulow) {
+    dp->robust_nu=robust_nulow;
+   } else if (robust_nu1>robust_nuhigh) {
+    dp->robust_nu=robust_nuhigh;
+   } else {
+    dp->robust_nu=(double)robust_nu1;
+   }
  
 #ifdef DEBUG
  printf("Cost final %g  initial %g\n",fx,fx0);
@ -846,3 +1019,387 @@ printf("NEW RTR cost=%g\n",fx);

  return 0;
 }
+
+
+
+
+/* Nesterov's steepest descent (NSD) solver for the robust ADMM cost,
+  single precision, GPU version.
+
+  x0 is overwritten with the new solution only when the final cost is lower
+  than the initial cost. info[0] and info[1] return the initial and final
+  cost. adata->robust_nu is replaced by the re-estimated value, clamped to
+  [robust_nulow,robust_nuhigh]. Always returns 0.
+
+  storage:
+  8N * 6 + N + 8M * 2 + 2M + M (base storage)
+  MAX( 2 * Blocks + 4, 8N(1 + ceil(M/Threads)))  for functions
+  Blocks = ceil(M/Threads)
+  note: Yd and Zd (8N floats each) are not carved out of gWORK, they are
+  allocated and released on the device inside this routine
+*/
+int
+nsd_solve_cuda_robust_admm_fl(
+  float *x0,         /* initial values and updated solution at output (size 8*N float) */
+  float *Y, /* Lagrange multiplier size 8N */
+  float *Z, /* consensus term B Z  size 8N */
+  float *y,         /* data vector (size 8*M float) */
+  int N,              /* no of stations */
+  int M,              /* no of constraints */
+  int itmax,          /* maximum number of iterations */
+  float admm_rho, /* ADMM regularization */
+  double robust_nulow, double robust_nuhigh, /* robust nu range */
+  double *info, /* initial and final residuals */
+  cublasHandle_t cbhandle, /* device handle */
+  float *gWORK, /* GPU allocated memory */
+  int tileoff, /* tile offset when solving for many chunks */
+  int ntiles, /* total tile (data) size being solved for */
+  me_data_t *adata)
+{
+
+  /* general note: all device variables end with a 'd' */
+  cudaError_t err;
+  cublasStatus_t cbstatus;
+
+  /* ME data */
+  me_data_t *dp=(me_data_t*)adata;
+  int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
+  /* coherency on device */
+  float *cohd;
+  /* baseline-station map on device/host */
+  char *bbd;
+
+  /* calculate no of cuda threads and blocks */
+  int ThreadsPerBlock=128;
+  int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
+
+
+  /* reshape x to make J: 2Nx2 complex float
+  */
+  complex float *x;
+  if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+  }
+  /* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
+   to
+  J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
+ */
+  float *Jd=(float*)x;
+  /* re J(0,0) */
+  my_fcopy(N, &x0[0], 8, &Jd[0], 4);
+  /* im J(0,0) */
+  my_fcopy(N, &x0[1], 8, &Jd[1], 4);
+  /* re J(1,0) */
+  my_fcopy(N, &x0[4], 8, &Jd[2], 4);
+  /* im J(1,0) */
+  my_fcopy(N, &x0[5], 8, &Jd[3], 4);
+  /* re J(0,1) */
+  my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
+  /* im J(0,1) */
+  my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
+  /* re J(1,1) */
+  my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
+  /* im J(1,1) */
+  my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
+
+
+  /* Y and Z are reordered on the host in exactly the same way as x0 */
+  complex float *Zx,*Yx;
+  if ((Zx=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+  }
+  if ((Yx=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+  }
+  float *YY=(float*)Yx;
+  my_fcopy(N, &Y[0], 8, &YY[0], 4);
+  my_fcopy(N, &Y[1], 8, &YY[1], 4);
+  my_fcopy(N, &Y[4], 8, &YY[2], 4);
+  my_fcopy(N, &Y[5], 8, &YY[3], 4);
+  my_fcopy(N, &Y[2], 8, &YY[4*N], 4);
+  my_fcopy(N, &Y[3], 8, &YY[4*N+1], 4);
+  my_fcopy(N, &Y[6], 8, &YY[4*N+2], 4);
+  my_fcopy(N, &Y[7], 8, &YY[4*N+3], 4);
+  float *ZZ=(float*)Zx;
+  my_fcopy(N, &Z[0], 8, &ZZ[0], 4);
+  my_fcopy(N, &Z[1], 8, &ZZ[1], 4);
+  my_fcopy(N, &Z[4], 8, &ZZ[2], 4);
+  my_fcopy(N, &Z[5], 8, &ZZ[3], 4);
+  my_fcopy(N, &Z[2], 8, &ZZ[4*N], 4);
+  my_fcopy(N, &Z[3], 8, &ZZ[4*N+1], 4);
+  my_fcopy(N, &Z[6], 8, &ZZ[4*N+2], 4);
+  my_fcopy(N, &Z[7], 8, &ZZ[4*N+3], 4);
+
+
+  int ci;
+
+/***************************************************/
+ cuFloatComplex *xd,*fgradxd,*etad,*zd,*x_propd,*z_propd,*Yd,*Zd;
+ float *yd;
+ float *wtd,*qd; /* for robust weight and log(weight) */
+ float robust_nu=(float)dp->robust_nu;
+ float q_sum,robust_nu1;
+ float deltanu;
+ int Nd=100; /* no of points where nu is sampled, note Nd<=M because the
+               samples are evaluated in qd, which has room for M values */
+ if (Nd>M) { Nd=M; }
+ deltanu=(float)(robust_nuhigh-robust_nulow)/(float)Nd;
+
+ /* for counting how many baselines contribute to each station
+   grad/hess calculation */
+ float *iwd,*iw;
+ if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+ }
+
+
+ /* carve the device work buffers out of gWORK, moff counts floats */
+ unsigned long int moff=0;
+ fgradxd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N; /* 4N complex means 8N float */
+ etad=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ zd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ x_propd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ xd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+ z_propd=(cuFloatComplex*)&gWORK[moff];
+ moff+=8*N;
+
+ yd=&gWORK[moff];
+ moff+=8*M;
+ cohd=&gWORK[moff];
+ moff+=Nbase*8;
+ bbd=(char*)&gWORK[moff];
+ unsigned long int charstor=(Nbase*2*sizeof(char))/sizeof(float);
+ if (!charstor || charstor%4) {
+  moff+=(charstor/4+1)*4; /* NOTE +4 multiple to align memory */
+ } else {
+  moff+=charstor;
+ }
+ iwd=&gWORK[moff];
+ if (!(N%4)) {
+  moff+=N;
+ } else {
+  moff+=(N/4+1)*4;
+ }
+ wtd=&gWORK[moff];
+ if (!(M%4)) {
+  moff+=M;
+ } else {
+  moff+=(M/4+1)*4;
+ }
+ qd=&gWORK[moff];
+ if (!(M%4)) {
+  moff+=M;
+ } else {
+  moff+=(M/4+1)*4;
+ }
+
+
+ /* Yd, Zd are allocated separately; check the result like every other
+    CUDA call here, so a failed allocation is not used by the copies below */
+ err=cudaMalloc((void **)&Yd, 4*N*sizeof(cuFloatComplex));
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMalloc((void **)&Zd, 4*N*sizeof(cuFloatComplex));
+ checkCudaError(err,__FILE__,__LINE__);
+
+ /* remaining memory */
+ float *gWORK1=&gWORK[moff];
+
+ /* yd <=y : V */
+ err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ /* need to give right offset for coherencies */
+ /* offset: cluster offset+time offset */
+ /* C */
+ err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ /* correct offset for baselines */
+ err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(char), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ /* xd <=x : solution */
+ err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMemcpy(Yd, Yx, 8*N*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMemcpy(Zd, Zx, 8*N*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+
+ float fx,fx0;
+
+ /* count how many baselines contribute to each station, store (inverse) in iwd */
+ count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
+ err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ free(iw);
+
+ /* set initial weights to 1 by a cuda kernel */
+ cudakernel_setweights_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, wtd, 1.0f);
+ fx=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,yd,cohd,bbd,wtd,cbhandle,gWORK1);
+ fx0=fx;
+ /* report the initial residual, info[] holds initial and final values */
+ info[0]=fx0;
+#ifdef DEBUG
+printf("Initial Cost=%g\n",fx0);
+#endif
+/***************************************************/
+  // gradient at x0;
+  cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
+  // Hessian 
+  cudakernel_fns_fhess_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,xd,zd,yd,cohd,bbd,iwd,wtd,cbhandle,gWORK1);
+  // initial step = 1/||Hess||
+  float hess_nrm;
+  cublasScnrm2(cbhandle,4*N,zd,1,&hess_nrm);
+  float t=1.0f/hess_nrm;
+  /* if initial step too small */
+  if (t<1e-6f) {
+   t=1e-6f;
+  }
+  
+  /* z <= x */
+  cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
+  float theta=1.0f;
+  float ALPHA = 1.01f; // step-size growth factor
+  float BETA = 0.5f; // step-size shrinkage factor
+  int k;
+  cuFloatComplex alpha;
+
+  for (k=0; k<itmax; k++) {
+    /* x_prop <= x */
+    cbstatus=cublasCcopy(cbhandle,4*N,xd,1,x_propd,1);
+    /* z_prop <= z */
+    cbstatus=cublasCcopy(cbhandle,4*N,zd,1,z_propd,1);
+    
+    /* x <= z - t * grad */
+    cbstatus=cublasCcopy(cbhandle,4*N,zd,1,xd,1);
+    alpha.x=-t;alpha.y=0.0f;
+    cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, xd, 1);
+
+    /* if ||x-z|| == t||grad|| is below threshold, stop iteration */
+    float grad_nrm,x_nrm;
+    cublasScnrm2(cbhandle,4*N,fgradxd,1,&grad_nrm);
+    cublasScnrm2(cbhandle,4*N,xd,1,&x_nrm);
+    /* norm(y-x)/max(1,norm(x)); */
+    if (grad_nrm*t/MAX(1.0f,x_nrm) < 1e-6f) {
+      break;
+    }
+
+
+    /* theta = 2/(1 + sqrt(1+4/(theta^2))); */
+    theta=2.0f/(1.0f + sqrtf(1.0f+4.0f/(theta*theta)));
+
+    /* z = x + (1-theta)*(x-x_prop); 
+       z = (2-theta)*x  - (1-theta) * x_prop */
+    cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
+    alpha.x=(2.0f-theta);alpha.y=0.0f;
+    cbstatus=cublasCscal(cbhandle,4*N,&alpha,zd,1);
+    alpha.x=-(1.0f-theta);alpha.y=0.0f;
+    cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, x_propd, 1, zd, 1);
+
+    /* eta = grad_old;
+     grad  <= grad_f( z ) */
+    cbstatus=cublasCcopy(cbhandle,4*N,fgradxd,1,etad,1);
+    cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,zd,Yd,Zd,admm_rho,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
+
+    /* z_prop <= z_prop - z */
+    alpha.x=-1.0f;alpha.y=0.0f;
+    cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, zd, 1, z_propd, 1);
+    /* eta <= eta - new_grad */
+    cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, etad, 1);
+   
+    /* ||z-z_prop|| */
+    float ydiffnrm;
+    cublasScnrm2(cbhandle,4*N,z_propd,1,&ydiffnrm);
+    /* (z_zold)'*(grad-grad_old) */
+    float dot_ydiff_gdiff;
+    cbstatus=cublasSdot(cbhandle, 8*N, (float*)z_propd, 1, (float*)etad, 1, &dot_ydiff_gdiff);
+#ifdef DEBUG
+   printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
+#endif
+    /* the above can be NAN, if so break loop */
+    if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) {
+     break;
+    }
+
+
+    /* backtracking
+     t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
+     t = min( ALPHA*t, max( BETA*t, t_hat ));
+    */
+    float t_hat=0.5f*(ydiffnrm*ydiffnrm)/fabsf(dot_ydiff_gdiff);
+    t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
+#ifdef DEBUG
+printf("k=%d theta=%e step=%e\n",k,theta,t);
+#endif
+  }
+
+  /* final residual */
+  fx=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,yd,cohd,bbd,wtd,cbhandle,gWORK1);
+  info[1]=fx;
+#ifdef DEBUG
+printf("NEW NSD cost=%g\n",fx);
+#endif
+
+/***************************************************/
+ cudaDeviceSynchronize();
+   /* w <= (p+nu)/(1+error^2), q<=w-log(w) */
+   /* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
+   cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
+   /* sumq<=sum(w-log(w))/M */
+   cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
+   q_sum/=(float)M;
+#ifdef DEBUG
+   printf("deltanu=%f sum(w-log(w))=%f\n",deltanu,q_sum);
+#endif
+  /* for nu range 2~numax evaluate, p-variate T
+     psi((nu0+p)/2)-ln((nu0+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 
+     note: AECM not ECME
+     and find min(| |) */
+   int ThreadsPerBlock2=ThreadsPerBlock/4;
+   /* grid size: ceil(Nd/ThreadsPerBlock2), using the same block size that
+      is passed to the kernel */
+   cudakernel_evaluatenu_fl_eight(ThreadsPerBlock2, (Nd+ThreadsPerBlock2-1)/ThreadsPerBlock2, Nd, q_sum, qd, deltanu,(float)robust_nulow,robust_nu);
+   /* find min(abs()) value */
+   cbstatus=cublasIsamin(cbhandle, Nd, qd, 1, &ci); /* 1 based index */
+   robust_nu1=(float)robust_nulow+(float)(ci-1)*deltanu;
+#ifdef DEBUG
+   printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
+#endif
+   /* seems pedantic, but make sure new value for robust_nu fits within bounds */
+   if (robust_nu1<robust_nulow) {
+    dp->robust_nu=robust_nulow;
+   } else if (robust_nu1>robust_nuhigh) {
+    dp->robust_nu=robust_nuhigh;
+   } else {
+    dp->robust_nu=(double)robust_nu1;
+   }
+  
+  /* only accept the new solution if the cost was reduced */
+  if(fx0>fx) {
+  //printf("Cost final %g  initial %g\n",fx,fx0);
+  /* copy back current solution */
+  err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
+  checkCudaError(err,__FILE__,__LINE__);
+
+
+  /* copy back solution to x0 : format checked*/
+  /* re J(0,0) */
+  my_fcopy(N, &Jd[0], 4, &x0[0], 8);
+  /* im J(0,0) */
+  my_fcopy(N, &Jd[1], 4, &x0[1], 8);
+  /* re J(1,0) */
+  my_fcopy(N, &Jd[2], 4, &x0[4], 8);
+  /* im J(1,0) */
+  my_fcopy(N, &Jd[3], 4, &x0[5], 8);
+  /* re J(0,1) */
+  my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
+  /* im J(0,1) */
+  my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
+  /* re J(1,1) */
+  my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
+  /* im J(1,1) */
+  my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
+
+  }
+  cudaFree(Yd);
+  cudaFree(Zd);
+
+  free(x);
+  free(Yx);
+  free(Zx);
+
+  return 0;
+}
--- a/src/lib/sagecal.h
+++ b/src/lib/sagecal.h
@ -71,6 +71,9 @@
 #define STYPE_RING 3
 #define STYPE_SHAPELET 4

+/* max source name length, increase it if names get longer */
+#define MAX_SNAME 2048
+
 /********* constants - from levmar ******************/
 #define CLM_INIT_MU       1E-03
 #define CLM_STOP_THRESH   1E-17
@ -192,6 +195,8 @@ typedef struct thread_data_base_ {

  /* following for ignoring clusters in simulation */
  int *ignlist; /* Mx1 array, if any value 1, that cluster will not be simulated */
+  /* flag for adding model to data */
+  int add_to_data;

  /* following used for multifrequency (channel) data */
  double *freqs;
@ -264,6 +269,18 @@ typedef struct thread_data_setwt_ {

 } thread_data_setwt_t;

+/* structure for weight calculation for baselines:
+   describes the slice of baselines one worker thread has to process */
+typedef struct thread_data_baselinewt_ {
+ int Nb; /* no of baselines this thread handles */
+ int boff; /* baseline offset per thread */
+
+ double *wt; /* 8 values per baseline */
+ double *u,*v; /* u,v coordinates, one value per baseline, indexed from boff */
+ double freq0; /* frequency, u and v are multiplied by this before use */
+
+} thread_data_baselinewt_t;
+
+

 /* structure for worker threads for jacobian calculation */
 typedef struct thread_data_jac_ {
@ -436,6 +453,21 @@ read_solutions(FILE *sfp,double *p,clus_source_t *carr,int N,int M);
 */ 
 extern int
 update_ignorelist(const char *ignfile, int *ignlist, int M, clus_source_t *carr);
+
+/* read ADMM regularization factor per cluster from text file, format:
+ cluster_id  hybrid_parameter admm_rho
+ ...
+ ...
+ (M values)
+ and store it in array arho : size Mt x 1, taking into account the hybrid parameter
+ also in array arhoslave : size M x 1, without taking hybrid params into account
+
+ admm_rho : can be 0 to ignore consensus, just normal calibration
+*/
+ 
+extern int
+read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double *arhoslave);
+
 /****************************** dataio.c ****************************/
 /* open binary file for input/output
 datfile: data file descriptor id
@ -874,7 +906,9 @@ __attribute__ ((target(MIC)))
 extern int
 lbfgs_fit_robust(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, void *adata);
+   double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads,
+ int whiten, /* if >0 whiten data 1: NCP, 2... */
+ void *adata);
 #ifdef HAVE_CUDA
 extern int
 lbfgs_fit_robust_cuda(
@ -925,15 +959,17 @@ calculate_residuals_multifreq(double *u,double *v,double *w,double *p,double *x,

 /* 
  calculate visibilities for multiple channels, no solutions are used
-  note: output column x is set to 0
+  note: output column x is set to 0 if add_to_data ==0, else model is added to data
 */
 extern int
-predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt);
+predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt,int add_to_data);


-/* predict with solutions in p , ignore clusters flagged in ignorelist (Mx1) array*/
+/* predict with solutions in p , ignore clusters flagged in ignorelist (Mx1) array
+ also correct final data with solutions for cluster ccid, if valid
+*/
 extern int
-predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignorelist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt);
+predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignorelist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt,int add_to_data, int ccid, double rho);
 /****************************** mderiv.cu ****************************/
 /* cuda driver for kernel */
 /* ThreadsPerBlock: keep <= 128
@ -1331,6 +1367,7 @@ osrlevmar_der_single_cuda_fl(
  int ntiles, /* total tile (data) size being solved for */
  double robust_nulow, double robust_nuhigh, /* robust nu range */
  int randomize, /* if >0 randomize */
+  int whiten, /* if >0 whiten data 1: NCP, 2... */
  void *adata);
 #endif /* HAVE_CUDA */

@ -1357,6 +1394,7 @@ rlevmar_der_single_nocuda(
  int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
  int Nt, /* no of threads */
  double robust_nulow, double robust_nuhigh, /* robust nu range */
+  int whiten, /* if >0 whiten data 1: NCP, 2... */
  void *adata);

 /* robust LM, OS acceleration */
@ -1396,6 +1434,7 @@ osrlevmar_der_single_nocuda(
  int Nt, /* no of threads */
  double robust_nulow, double robust_nuhigh, /* robust nu range */
  int randomize, /* if >0 randomize */
+  int whiten, /* if >0 whiten data 1: NCP, 2... */
  void *adata);

 /****************************** updatenu.c ****************************/
@ -1425,6 +1464,15 @@ __attribute__ ((target(MIC)))
 #endif
 extern double
 update_w_and_nu(double nu0, double *w, double *ed, int N, int Nt,  double nulow, double nuhigh);
+
+/* update weights array wt by multiplying it with the inverse density function
+  1/( 1+f(u,v) ) 
+ as u,v->inf, f(u,v) -> 0 so long baselines are not affected 
+ wt : Nbase*8 x 1
+ u,v : Nbase x 1
+ freq0 : frequency used to scale u,v
+ Nt : no of threads
+ note: u = u/c, v=v/c here, so need freq to convert to wavelengths */
+extern void
+add_whitening_weights(int Nbase, double *wt, double *u, double *v, double freq0, int Nt);
 /****************************** clmfit_nocuda.c ****************************/
 /* LM with LAPACK */
 /** keep interface almost the same as in levmar **/
@ -1734,6 +1782,22 @@ rtr_solve_nocuda_robust(
  double *info, /* initial and final residuals */
  me_data_t *adata);

+/* Nesterov's SD (steepest descent), robust version without CUDA */
+#ifdef USE_MIC
+__attribute__ ((target(MIC)))
+#endif
+extern int
+nsd_solve_nocuda_robust(
+  double *x,         /* initial values and updated solution at output (size 8*N double) */
+  double *y,         /* data vector (size 8*M double) */
+  int N,              /* no. of stations */
+  int M,              /* no. of constraints */
+  int itmax,          /* maximum number of iterations */
+  double robust_nulow, double robust_nuhigh, /* robust nu range */
+  double *info, /* initial and final residuals */
+  me_data_t *adata); /* pointer to additional data
+                */
+
 /****************************** rtr_solve_robust_admm.c ****************************/
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
@ -1824,6 +1888,24 @@ rtr_solve_cuda_robust_fl(
  int tileoff, /* tile offset when solving for many chunks */
  int ntiles, /* total tile (data) size being solved for */
  me_data_t *adata);
+
+
+/* Nesterov's steepest descent */
+extern int
+nsd_solve_cuda_robust_fl(
+  float *x0,         /* initial values and updated solution at output (size 8*N float) */
+  float *y,         /* data vector (size 8*M float) */
+  int N,              /* no of stations */
+  int M,              /* no of constraints */
+  int itmax,          /* maximum number of iterations */
+  double robust_nulow, double robust_nuhigh, /* robust nu range */
+  double *info, /* initial and final residuals */
+  cublasHandle_t cbhandle, /* device handle */
+  float *gWORK, /* GPU allocated memory */
+  int tileoff, /* tile offset when solving for many chunks */
+  int ntiles, /* total tile (data) size being solved for */
+  me_data_t *adata);
+
 /****************************** rtr_solve_robust_cuda_admm.c ****************************/
 /* ADMM solver */
 extern int
@ -1834,8 +1916,7 @@ rtr_solve_cuda_robust_admm_fl(
  float *y,         /* data vector (size 8*M float) */
  int N,              /* no of stations */
  int M,              /* no of constraints */
-  int itmax_sd,          /* maximum number of iterations RSD */
-  int itmax_rtr,          /* maximum number of iterations RTR */
+  int itmax_rtr,          /* maximum number of iterations */
  float Delta_bar, float Delta0, /* Trust region radius and initial value */
  float admm_rho, /* ADMM regularization */
  double robust_nulow, double robust_nuhigh, /* robust nu range */
@ -1846,6 +1927,26 @@ rtr_solve_cuda_robust_admm_fl(
  int tileoff, /* tile offset when solving for many chunks */
  int ntiles, /* total tile (data) size being solved for */
  me_data_t *adata);
+
+
+/* Nesterov's SD (steepest descent), ADMM version in single precision:
+   compared to nsd_solve_cuda_robust_fl, the extra arguments are the
+   Lagrange multiplier Y, the consensus term Z and the regularization
+   admm_rho */
+extern int
+nsd_solve_cuda_robust_admm_fl(
+  float *x0,         /* initial values and updated solution at output (size 8*N float) */
+  float *Y, /* Lagrange multiplier size 8N */
+  float *Z, /* consensus term B Z  size 8N */
+  float *y,         /* data vector (size 8*M float) */
+  int N,              /* no of stations */
+  int M,              /* no of constraints */
+  int itmax,          /* maximum number of iterations */
+  float admm_rho, /* ADMM regularization */
+  double robust_nulow, double robust_nuhigh, /* robust nu range */
+  double *info, /* initial and final residuals */
+  cublasHandle_t cbhandle, /* device handle */
+  float *gWORK, /* GPU allocated memory */
+  int tileoff, /* tile offset when solving for many chunks */
+  int ntiles, /* total tile (data) size being solved for */
+  me_data_t *adata);
 #endif /* HAVE_CUDA */
 /****************************** lmfit.c ****************************/
 /****************************** lmfit_nocuda.c ****************************/
@ -1885,6 +1986,7 @@ random_permutation(int n, int weighted_iter, double *w);
 #define SM_RLM_RLBFGS 2
 #define SM_RTR_OSLM_LBFGS 4
 #define SM_RTR_OSRLM_RLBFGS 5
+#define SM_NSD_RLBFGS 6
 /* fit visibilities
  u,v,w: u,v,w coordinates (wavelengths) size Nbase*tilesz x 1 
  u,v,w are ordered with baselines, timeslots
@ -1913,6 +2015,7 @@ random_permutation(int n, int weighted_iter, double *w);
  nulow,nuhigh: robust nu search range
  randomize: if >0, randomize cluster selection in SAGE and OS subset selection

+  whiten : if >0 whiten data 1: NCP, 2... 
  mean_nu: output mean value of nu
  res_0,res_1: initial and final residuals (output)
  return val=0 if final residual< initial residual
@ -1923,7 +2026,7 @@ __attribute__ ((target(MIC)))
 #endif
 extern int
 sagefit_visibilities(double *u, double *v, double *w, double *x, int N, 
-   int Nbase, int tilesz,  baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt,int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv, int solver_mode, double nulow, double nuhigh, int randomize, double *mean_nu, double *res_0, double *res_1); 
+   int Nbase, int tilesz,  baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt,int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv, int solver_mode, double nulow, double nuhigh, int randomize, int whiten, double *mean_nu, double *res_0, double *res_1); 

 /* same as above, but uses 2 GPUS in the LM stage */
 extern int
@ -2010,6 +2113,9 @@ typedef struct pipeline_ {
 #ifndef PT_DO_WORK_RRTR /* Robust RTR */
 #define PT_DO_WORK_RRTR 8
 #endif
+#ifndef PT_DO_WORK_NSD /* Nesterov's SD */
+#define PT_DO_WORK_NSD 9
+#endif
 #ifndef PT_DO_MEMRESET 
 #define PT_DO_MEMRESET 99
 #endif
@ -2086,7 +2192,7 @@ typedef struct gb_data_admm_fl_ {
  float *p[2]; /* pointer to parameters being solved by each thread */
  float *Y[2]; /* pointer to Lagrange multiplier */
  float *Z[2]; /* pointer to consensus term */
-  float admm_rho;
+  float admm_rho[2];
  float *x[2]; /* pointer to data being fit by each thread */
  int M[2];
  int N[2];
@ -2233,23 +2339,23 @@ update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi);
   Y : Lagrange multiplier
   BZ : consensus term
   Y,BZ : size same as pp : 8*N*Mt x1 double values (re,img) for each station/direction 
-   admm_rho : regularization factor 
+   admm_rho : regularization factor array size Mx1
 */ 
 extern int
 sagefit_visibilities_admm(double *u, double *v, double *w, double *x, int N,
-   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1);
+   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1);

 /* ADMM cost function  = normal_cost + ||Y^H(J-BZ)|| + rho/2 ||J-BZ||^2 */
 /* extra params
   Y : Lagrange multiplier
   BZ : consensus term
   Y,BZ : size same as pp : 8*N*Mt x1 double values (re,img) for each station/direction 
-   admm_rho : regularization factor 
+   admm_rho : regularization factor  array size Mx1
 */ 
 #ifdef HAVE_CUDA
 extern int
 sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x, int N,
-   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,  double nulow, double nuhigh, int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1);
+   int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,  double nulow, double nuhigh, int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1);
 #endif

 #ifdef __cplusplus
--- a/src/lib/updatenu.c
+++ b/src/lib/updatenu.c
@ -336,3 +336,151 @@ update_nu(double logsumw, int Nd, int Nt, double nulow, double nuhigh, int p, do
  free(q);
  return thisnu;
 }
+
+
+/* whitening weight for a baseline of length ud = sqrt(u^2+v^2):
+   returns 1/(1+fo(ud)) when ud lies in [40,800] and 1 outside that
+   interval, where
+     fo(x) = sum_k a_k*exp(-((x-b_k)/c_k)^2),  k=1..6
+   (mean(fo(x)) is about 1) */
+static double
+ncp_weight(double ud) {
+ /* a_k, b_k, c_k of the six terms; all arithmetic is single precision */
+ static const float amp[6]={-0.9415f, 5.231f, 2.209f, 10.43f, 104.9f, -101.3f};
+ static const float ctr[6]={117.1f, 49.57f, 67.29f, 72.19f, 98.72f, 101.2f};
+ static const float wid[6]={15.08f, 13.79f, 14.86f, 200.8f, 65.8f, 66.63f};
+ const float len=(float)ud;
+ float total=0.0f;
+ int k;
+
+ /* outside the fitted interval the weight stays at unity */
+ if (len<40.0f || len>800.0f) {
+  return 1.0;
+ }
+
+ for (k=0; k<6; k++) {
+  /* exponent -((x-b_k)/c_k)^2 */
+  float e=(len-ctr[k])/wid[k];
+  e*=-e;
+  total+=amp[k]*expf(e);
+ }
+ return (1.0/((double)total+1.0)); /* as x-> inf, goes to 1 */
+}
+
+/* worker thread: scale the weights of the baselines assigned to it.
+   data points to a thread_data_baselinewt_t; baselines boff..boff+Nb-1 are
+   processed and all 8 weights of each one are multiplied by
+   ncp_weight(sqrt(u^2+v^2)), with u,v first multiplied by freq0.
+   Always returns NULL. */
+static void *
+threadfn_setblweight(void *data) {
+ thread_data_baselinewt_t *t=(thread_data_baselinewt_t*)data;
+
+ int ci,cj;
+ for (ci=0; ci<t->Nb; ci++) {
+  /* get sqrt(u^2+v^2) */
+  double uu=t->u[ci+t->boff]*t->freq0;
+  double vv=t->v[ci+t->boff]*t->freq0;
+  double a=ncp_weight(sqrt(uu*uu+vv*vv));
+  /* the same factor applies to all 8 values of this baseline */
+  double *wb=&t->wt[8*(ci+t->boff)];
+  for (cj=0; cj<8; cj++) {
+   wb[cj]*=a;
+  }
+#ifdef DEBUG
+  /* per-baseline trace, kept out of normal builds: it is printed from
+     every worker thread for every baseline */
+  printf("%lf %lf %lf\n",uu,vv,a);
+#endif
+ }
+
+ return NULL;
+}
+
+
+/* update weights array wt by multiplying it with the inverse density function
+  1/( 1+f(u,v) ) 
+ as u,v->inf, f(u,v) -> 0 so long baselines are not affected 
+ wt : Nbase*8 x 1
+ u,v : Nbase x 1
+ freq0 : frequency used to scale u,v
+ Nt : no of threads, values below 1 are treated as 1
+ note: u = u/c, v=v/c here, so need freq to convert to wavelengths
+
+ the baselines are divided into contiguous slices of at most
+ ceil(Nbase/Nt) baselines, one slice per worker thread; if a thread cannot
+ be created, its slice is processed in the calling thread instead */
+void
+add_whitening_weights(int Nbase, double *wt, double *u, double *v, double freq0, int Nt) {
+ pthread_attr_t attr;
+ pthread_t *th_array;
+ thread_data_baselinewt_t *threaddata;
+
+ int ci,nth1,nth;
+ int nstarted; /* no of threads actually created, these have to be joined */
+ int Nthb0,Nthb;
+
+ /* nothing to do without baselines */
+ if (Nbase<=0) {
+  return;
+ }
+ /* at least one worker is needed, this also avoids a division by zero */
+ if (Nt<1) {
+  Nt=1;
+ }
+
+ Nthb0=(Nbase+Nt-1)/Nt;
+
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
+
+ if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
+#ifndef USE_MIC
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+   exit(1);
+ }
+
+ if ((threaddata=(thread_data_baselinewt_t*)malloc((size_t)Nt*sizeof(thread_data_baselinewt_t)))==0) {
+#ifndef USE_MIC
+    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+#endif
+    exit(1);
+ }
+
+
+  /* iterate over threads, allocating baselines per thread */
+  ci=0;
+  nstarted=0;
+  for (nth=0;  nth<Nt && ci<Nbase; nth++) {
+    if (ci+Nthb0<Nbase) {
+     Nthb=Nthb0;
+    } else {
+     Nthb=Nbase-ci;
+    }
+
+    threaddata[nth].Nb=Nthb;
+    threaddata[nth].boff=ci;
+    threaddata[nth].wt=wt;
+    threaddata[nth].u=u;
+    threaddata[nth].v=v;
+    threaddata[nth].freq0=freq0;
+
+    if (pthread_create(&th_array[nstarted],&attr,threadfn_setblweight,(void*)(&threaddata[nth]))==0) {
+     nstarted++;
+    } else {
+     /* no thread available: do this slice here, so that no baseline is
+        skipped and no invalid thread id is joined later */
+     threadfn_setblweight((void*)(&threaddata[nth]));
+    }
+    /* next baseline set */
+    ci=ci+Nthb;
+  }
+
+  /* now wait for threads to finish */
+  for(nth1=0; nth1<nstarted; nth1++) {
+   pthread_join(th_array[nth1],NULL);
+  }
+
+ pthread_attr_destroy(&attr);
+ free(th_array);
+ free(threaddata);
+}