From fa1448be0197763c4f626f677f435c1e5c2cfd35 Mon Sep 17 00:00:00 2001
From: Sarod Yatawatta <sarod@dop373.nfra.nl>
Date: Mon, 30 Jul 2018 15:25:08 +0200
Subject: [PATCH 1/4] cleaning up LBFGS related code, removed duplicates

---
 README.md                           |   2 +-
 src/lib/Dirac/CMakeLists.txt        |   2 +-
 src/lib/Dirac/Dirac.h               | 138 ++--
 src/lib/Dirac/Makefile.gpu          |   4 +-
 src/lib/Dirac/admm_solve.c          | 168 -----
 src/lib/Dirac/lbfgs.c               | 119 +---
 src/lib/Dirac/lbfgs_nocuda.c        | 522 +++------------
 src/lib/Dirac/lmfit.c               |  22 +-
 src/lib/Dirac/lmfit_nocuda.c        |  10 +-
 src/lib/Dirac/robust_lbfgs_nocuda.c | 974 ++++++++++------------------
 10 files changed, 503 insertions(+), 1458 deletions(-)

diff --git a/README.md b/README.md
index 4ab434b..fa3ca5f 100644
--- a/README.md
+++ b/README.md
@@ -175,7 +175,7 @@ which can be used to count the row number. It will keep repeating this, for each
 The rows 0 to 7 belong to the solutions for the 1st station. The rows 8 to 15 for the 2nd station and so on. 
 Each 8 rows of any given column represent the 8 values of a 2x2 Jones matrix. Lets say these are ```S0,S1,S2,S3,S4,S5,S6``` and ```S7```. Then the Jones matrix is ```[S0+j*S1, S4+j*S5; S2+j*S3, S6+j*S7]``` (the ';' denotes the 1st row of the 2x2 matrix).
 
-When a luster has a chunk size > 1, there will be more than 1 solution per given time interval. 
+When a cluster has a chunk size > 1, there will be more than 1 solution per given time interval. 
 So for this cluster, there will be more than 1 column in the solution file, the exact number of columns being equal to the chunk size.
 
 
diff --git a/src/lib/Dirac/CMakeLists.txt b/src/lib/Dirac/CMakeLists.txt
index 814a333..ba648bd 100644
--- a/src/lib/Dirac/CMakeLists.txt
+++ b/src/lib/Dirac/CMakeLists.txt
@@ -8,7 +8,6 @@ set (objects
     manifold_average
     mdl
     myblas
-    robust_lbfgs_nocuda
     robustlm
     rtr_solve_robust_admm
     updatenu
@@ -50,6 +49,7 @@ else()
         lmfit_nocuda
         consensus_poly
         lbfgs_nocuda
+        robust_lbfgs_nocuda
         rtr_solve
         rtr_solve_robust
     )
diff --git a/src/lib/Dirac/Dirac.h b/src/lib/Dirac/Dirac.h
index 09504ad..5830600 100644
--- a/src/lib/Dirac/Dirac.h
+++ b/src/lib/Dirac/Dirac.h
@@ -139,8 +139,27 @@ find_sumproduct(int N, float *x, float *y, float *sum1, float *sum2, int Nt);
 /****************************** lbfgs.c ****************************/
 /****************************** lbfgs_nocuda.c ****************************/
 /* LBFGS routines */
-/* func: vector function to minimize, actual cost minimized is ||func-x||^2
-   NOTE: gradient function given seperately
+
+#ifndef HAVE_CUDA
+/* line search */
+/* func: scalar function
+   xk: parameter values size m x 1 (at which step is calculated)
+   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
+   alpha1: initial value for step
+   sigma,rho,t1,t2,t3: line search parameters (from Fletcher) 
+   m: size or parameter vector
+   step: step size for differencing 
+   adata:  additional data passed to the function
+*/
+extern double 
+linesearch(
+   double (*func)(double *p, int m, void *adata),
+   double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, int m, double step, void *adata);
+ 
+/* cost function : return a scalar cost, input : p (mx1) parameters, m: no. of params, adata: additional data
+   grad function: return gradient (mx1): input : p (mx1) parameters, g (mx1) gradient vector, m: no. of params, adata: additional data
+*/
+/* 
    p: parameters m x 1 (used as initial value, output final value)
    x: data  n x 1
    itmax: max iterations
@@ -153,9 +172,21 @@ __attribute__ ((target(MIC)))
 #endif
 extern int
 lbfgs_fit(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
+   double (*cost_func)(double *p, int m, void *adata),
+   void (*grad_func)(double *p, double *g, int m, void *adata),
+   void (*change_batch)(int iter, void *adata),
+   double *p, int m, int itmax, int M, void *adata);
+#endif /* !HAVE_CUDA */
+
+#ifdef HAVE_CUDA
+extern int
+lbfgs_fit(
+   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata);
+extern int
+lbfgs_fit_robust_cuda(
    double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, void *adata);
- 
+#endif /* HAVE_CUDA */
+
 /****************************** robust_lbfgs_nocuda.c ****************************/
 typedef struct thread_data_logf_t_ {
   double *f;
@@ -164,22 +195,21 @@ typedef struct thread_data_logf_t_ {
   int start,end;
   double sum;
 } thread_data_logf_t;
-
 /* robust_nu: nu in T distribution */
+
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
 extern int
-lbfgs_fit_robust(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
+lbfgs_fit_robust_wrapper(
    double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads,
  void *adata);
-#ifdef HAVE_CUDA
+
 extern int
-lbfgs_fit_robust_cuda(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, void *adata);
-#endif
+lbfgs_fit_wrapper(
+   double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads,
+ void *adata);
+
 
 /****************************** mderiv.cu ****************************/
 /* cuda driver for kernel */
@@ -1300,6 +1330,15 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
 extern void
 openblas_set_num_threads(int num_threads);
 
+/****************************** lmfit.c ****************************/
+/****************************** lmfit_nocuda.c ****************************/
+/* minimization (or vector cost) function (multithreaded) */
+/* p: size mx1 parameters
+   x: size nx1 model being calculated
+   data: extra info needed */
+extern void
+minimize_viz_full_pth(double *p, double *x, int m, int n, void *data);
+
 /********* solver modes *********/
 #define SM_LM_LBFGS 1
 #define SM_OSLM_LBFGS 0
@@ -1381,15 +1420,13 @@ bfgsfit_visibilities(double *u, double *v, double *w, double *x, int N,
    int Nbase, int tilesz,  baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_lbfgs, int lbfgs_m, int gpu_threads, int solver_mode, double mean_nu, double *res_0, double *res_1); 
 
 
+
+#ifdef HAVE_CUDA
 extern int
 bfgsfit_visibilities_gpu(double *u, double *v, double *w, double *x, int N, 
    int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_lbfgs, int lbfgs_m, int gpu_threads, int solver_mode,  double mean_nu, double *res_0, double *res_1); 
 
 
-
-
-
-#ifdef HAVE_CUDA
 /* data struct shared by all threads */
 typedef struct gb_data_ {
   int status[2]; /* 0: do nothing, 
@@ -1492,75 +1529,6 @@ extern int
 sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int N,
    int Nbase, int tilesz,  baseline_t *barr,  clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,  double nulow, double nuhigh, int randomize, double *mean_nu, double *res_0, double *res_1);
 
-/****************************** stationbeam.c ****************************/
-/* 
-  ra,dec: source direction (rad)
-  ra0,dec0: beam center (rad)
-  f: frequency (Hz)
-  f0: beam forming frequency (Hz)
-  
-  longitude,latitude : Nx1 array of station positions (rad,rad)
-  time_jd: JD (day) time
-  Nelem : Nx1 array, no. of elements used in each station
-  x,y,z: Nx1 pointer arrays to station positions, each station has Nelem[]x1 arrays
-
-  beamgain: Nx1 array of station beam gain along the source direction
-*/ 
-extern int
-arraybeam(double ra, double dec, double ra0, double dec0, double f, double f0, int N, double *longitude, double *latitude, double time_jd, int *Nelem, double **x, double **y, double **z, double *beamgain);
-
-
-/****************************** predict_withbeam.c ****************************/
-/* precalculate cluster coherencies
-  u,v,w: u,v,w coordinates (wavelengths) size Nbase*tilesz x 1 
-  u,v,w are ordered with baselines, timeslots
-  x: coherencies size Nbase*4*Mx 1
-   ordered by XX(re,im),XY(re,im),YX(re,im), YY(re,im), baseline, timeslots
-  N: no of stations
-  Nbase: total no of baselines (including more than one tile or timeslot)
-  barr: baseline to station map, size Nbase*tilesz x 1
-  carr: sky model/cluster info size Mx1 of clusters
-  M: no of clusters
-  freq0: frequency
-  fdelta: bandwidth for freq smearing
-  tdelta: integration time for time smearing
-  dec0: declination for time smearing
-  uvmin: baseline length sqrt(u^2+v^2) below which not to include in solution
-  uvmax: baseline length higher than this not included in solution
-
-  Station beam specific parameters
-  ph_ra0,ph_dec0: beam pointing rad,rad
-  ph_freq0: beam reference freq
-  longitude,latitude: Nx1 arrays (rad,rad) station locations
-  time_utc: JD (day) : tilesz x 1 
-  tilesz: how many tiles: == unique time_utc
-  Nelem: Nx1 array, size of stations (elements)
-  xx,yy,zz: Nx1 arrays of station element locations arrays xx[],yy[],zz[]
-  Nt: no of threads
-
-  NOTE: prediction is done for all baselines, even flagged ones
-  and flags are set to 2 for baselines lower than uvcut
-*/
-
-extern int
-precalculate_coherencies_withbeam(double *u, double *v, double *w, complex double *x, int N,
-   int Nbase, baseline_t *barr,  clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, 
- double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int tileze, int *Nelem, double **xx, double **yy, double **zz, int Nt);
-
-
-extern int
-predict_visibilities_multifreq_withbeam(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
- double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int Nt, int add_to_data);
-
-extern int
-calculate_residuals_multifreq_withbeam(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,
-double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int Nt, int ccid, double rho, int phase_only);
-
-
-/* change epoch of soure ra,dec from J2000 to JAPP */
-/* also the beam pointing ra_beam,dec_beam */
-extern int
-precess_source_locations(double jd_tdb, clus_source_t *carr, int M, double *ra_beam, double *dec_beam, int Nt);
 
 #ifdef __cplusplus
      } /* extern "C" */
diff --git a/src/lib/Dirac/Makefile.gpu b/src/lib/Dirac/Makefile.gpu
index 564defd..e01d9e0 100644
--- a/src/lib/Dirac/Makefile.gpu
+++ b/src/lib/Dirac/Makefile.gpu
@@ -21,7 +21,7 @@ INCLUDES=-I.
 LIBPATH= $(CUDALIB)
 
 
-OBJECTS=lmfit.o lbfgs.o myblas.o mderiv.o clmfit.o clmfit_nocuda.o barrier.o robust.o robustlm.o oslmfit.o mderiv_fl.o clmfit_fl.o updatenu.o robust_lbfgs_nocuda.o robust_fl.o manifold_fl.o rtr_solve_cuda.o rtr_solve_robust_cuda.o manifold_average.o consensus_poly.o rtr_solve_robust_cuda_admm.o rtr_solve_robust_admm.o admm_solve.o load_balance.o mdl.o 
+OBJECTS=lmfit.o lbfgs.o myblas.o mderiv.o clmfit.o clmfit_nocuda.o barrier.o robust.o robustlm.o oslmfit.o mderiv_fl.o clmfit_fl.o updatenu.o robust_fl.o manifold_fl.o rtr_solve_cuda.o rtr_solve_robust_cuda.o manifold_average.o consensus_poly.o rtr_solve_robust_cuda_admm.o rtr_solve_robust_admm.o admm_solve.o load_balance.o mdl.o 
 
 
 default:libdirac.a
@@ -47,8 +47,6 @@ robust_fl.o:robust_fl.cu
 	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<
 oslmfit.o:oslmfit.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
-robust_lbfgs_nocuda.o:robust_lbfgs_nocuda.c
-	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 clmfit_fl.o:clmfit_fl.c
 	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI)  -c $<
 updatenu.o:updatenu.c
diff --git a/src/lib/Dirac/admm_solve.c b/src/lib/Dirac/admm_solve.c
index 9628069..a8edb23 100644
--- a/src/lib/Dirac/admm_solve.c
+++ b/src/lib/Dirac/admm_solve.c
@@ -213,174 +213,6 @@ mylm_fit_single_pth(double *p, double *x, int m, int n, void *data) {
 
 
 
-/* worker thread function for prediction */
-static void *
-predict_threadfn_withgain_full(void *data) {
- thread_data_base_t *t=(thread_data_base_t*)data;
- 
- int ci,cm,sta1,sta2;
- double *pm;
- complex double C[4],G1[4],G2[4],T1[4],T2[4];
- int M=(t->M);
- int Ntilebase=(t->Nbase)*(t->tilesz);
- int px;
-
- for (ci=0; ci<t->Nb; ci++) {
-   /* iterate over the sky model and calculate contribution */
-   /* for this x[8*ci:8*(ci+1)-1] */
-   memset(&(t->x[8*ci]),0,sizeof(double)*8);
-
-   /* if this baseline is flagged, we do not compute */
-   if (!t->barr[ci+t->boff].flag) {
-
-   /* stations for this baseline */
-   sta1=t->barr[ci+t->boff].sta1;
-   sta2=t->barr[ci+t->boff].sta2;
-   for (cm=0; cm<M; cm++) { /* clusters */
-     /* gains for this cluster, for sta1,sta2 */
-     /* depending on the baseline index, and cluster chunk size,
-        select proper parameter addres */
-    /* depending on the chunk size and the baseline index,
-        select right set of parameters 
-       data x=[0,........,Nbase*tilesz]
-       divided into nchunk chunks
-       p[0] -> x[0.....Nbase*tilesz/nchunk-1]
-       p[1] -> x[Nbase*tilesz/nchunk......2*Nbase*tilesz-1]
-       ....
-       p[last] -> x[(nchunk-1)*Nbase*tilesz/nchunk......Nbase*tilesz]
-
-       so given bindex,  right p[] is bindex/((Nbase*tilesz+nchunk-1)/nchunk)
-       */
-
-     px=(ci+t->boff)/((Ntilebase+t->carr[cm].nchunk-1)/t->carr[cm].nchunk);
-     //pm=&(t->p[cm*8*N]);
-     pm=&(t->p[t->carr[cm].p[px]]);
-     G1[0]=(pm[sta1*8])+_Complex_I*(pm[sta1*8+1]);
-     G1[1]=(pm[sta1*8+2])+_Complex_I*(pm[sta1*8+3]);
-     G1[2]=(pm[sta1*8+4])+_Complex_I*(pm[sta1*8+5]);
-     G1[3]=(pm[sta1*8+6])+_Complex_I*(pm[sta1*8+7]);
-     G2[0]=(pm[sta2*8])+_Complex_I*(pm[sta2*8+1]);
-     G2[1]=(pm[sta2*8+2])+_Complex_I*(pm[sta2*8+3]);
-     G2[2]=(pm[sta2*8+4])+_Complex_I*(pm[sta2*8+5]);
-     G2[3]=(pm[sta2*8+6])+_Complex_I*(pm[sta2*8+7]);
-
-
-      /* use pre calculated values */
-      C[0]=t->coh[4*M*ci+4*cm];
-      C[1]=t->coh[4*M*ci+4*cm+1];
-      C[2]=t->coh[4*M*ci+4*cm+2];
-      C[3]=t->coh[4*M*ci+4*cm+3];
-
-     /* form G1*C*G2' */
-     /* T1=G1*C  */
-     amb(G1,C,T1);
-     /* T2=T1*G2' */
-     ambt(T1,G2,T2);
-
-     /* add to baseline visibilities */
-     t->x[8*ci]+=creal(T2[0]);
-     t->x[8*ci+1]+=cimag(T2[0]);
-     t->x[8*ci+2]+=creal(T2[1]);
-     t->x[8*ci+3]+=cimag(T2[1]);
-     t->x[8*ci+4]+=creal(T2[2]);
-     t->x[8*ci+5]+=cimag(T2[2]);
-     t->x[8*ci+6]+=creal(T2[3]);
-     t->x[8*ci+7]+=cimag(T2[3]);
-   }
-  }
- }
-
- return NULL;
-}
-
-
-/* minimization function (multithreaded) */
-/* p: size mx1 parameters
-   x: size nx1 data calculated
-   data: extra info needed */
-static void
-minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {
-
-  me_data_t *dp=(me_data_t*)data;
-  /* u,v,w : size Nbase*tilesz x 1  x: size Nbase*8*tilesz x 1 */
-  /* barr: size Nbase*tilesz x 1 carr: size Mx1 */
-  /* pp: size 8*N*M x 1 */
-  /* pm: size Mx1 of double */
-
-  int nth,nth1,ci;
-
-  /* no of threads */
-  int Nt=(dp->Nt);
-  int Nthb0,Nthb;
-  pthread_attr_t attr;
-  pthread_t *th_array;
-  thread_data_base_t *threaddata;
-
-  int Nbase1=(dp->Nbase)*(dp->tilesz);
-
-  /* calculate min baselines a thread can handle */
-  //Nthb0=ceil((double)Nbase1/(double)Nt);
-  Nthb0=(Nbase1+Nt-1)/Nt;
-
-  /* setup threads */
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
-
-  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if ((threaddata=(thread_data_base_t*)malloc((size_t)Nt*sizeof(thread_data_base_t)))==0) {
-    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-    exit(1);
-  }
-
-
-  /* iterate over threads, allocating baselines per thread */
-  ci=0;
-  for (nth=0;  nth<Nt && ci<Nbase1; nth++) {
-    /* this thread will handle baselines [ci:min(Nbase1-1,ci+Nthb0-1)] */
-    /* determine actual no. of baselines */
-    if (ci+Nthb0<Nbase1) {
-     Nthb=Nthb0;
-    } else {
-     Nthb=Nbase1-ci;
-    }
-
-    threaddata[nth].boff=ci;
-    threaddata[nth].Nb=Nthb;
-    threaddata[nth].barr=dp->barr;
-    threaddata[nth].u=&(dp->u[ci]);
-    threaddata[nth].v=&(dp->v[ci]);
-    threaddata[nth].w=&(dp->w[ci]);
-    threaddata[nth].carr=dp->carr;
-    threaddata[nth].M=dp->M;
-    threaddata[nth].x=&(x[8*ci]);
-    threaddata[nth].p=p;
-    threaddata[nth].N=dp->N;
-    threaddata[nth].Nbase=dp->Nbase;
-    threaddata[nth].tilesz=dp->tilesz;
-    threaddata[nth].coh=&(dp->coh[4*(dp->M)*ci]);
-    
-    //printf("thread %d predict  data from %d baselines %d\n",nth,8*ci,Nthb);
-    pthread_create(&th_array[nth],&attr,predict_threadfn_withgain_full,(void*)(&threaddata[nth]));
-    /* next baseline set */
-    ci=ci+Nthb;
-  }
-
-  /* now wait for threads to finish */
-  for(nth1=0; nth1<nth; nth1++) {
-   pthread_join(th_array[nth1],NULL);
-  }
-
- pthread_attr_destroy(&attr);
-
-
- free(th_array);
- free(threaddata);
-
-}
-
 
 /******************** CUP version *****************************/
 
diff --git a/src/lib/Dirac/lbfgs.c b/src/lib/Dirac/lbfgs.c
index d853fba..e30dd57 100644
--- a/src/lib/Dirac/lbfgs.c
+++ b/src/lib/Dirac/lbfgs.c
@@ -286,7 +286,7 @@ mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, i
 
 /* cubic interpolation in interval [a,b] (a>b is possible)
    to find step that minimizes cost function */
-/* func: vector function
+/*
    xk: parameter values size m x 1 (at which step is calculated)
    pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
    a/b:  interval for interpolation
@@ -299,7 +299,6 @@ mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, i
 */
 static double 
 cubic_interp(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
    double *xk, double *pk, double a, double b, double *x, double *xp,  double *xo, int m, int n, double step, void *adata,  th_pipeline *tp, gbdata_b *tpg) {
 
   double f0,f1,f0d,f1d; /* function values and derivatives at a,b */
@@ -308,10 +307,6 @@ cubic_interp(
 
   my_dcopy(m,xk,1,xp,1); /* xp<=xk */
   my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */
-//  func(xp,x,m,n,adata);
-//  my_daxpy(n,xo,-1.0,x);
-//  f0=my_dnrm2(n,x);
-//  f0*=f0;
   sync_barrier(&(tp->gate1));
   tpg->lmdata[0]->p=tpg->lmdata[1]->p=xp;
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
@@ -323,9 +318,6 @@ cubic_interp(
 
   /* grad(phi_0): evaluate at -step and +step */
   my_daxpy(m,pk,step,xp); /* xp<=xp+(a+step)*pk */
-//  func(xp,x,m,n,adata);
-//  my_daxpy(n,xo,-1.0,x);
-//  p01=my_dnrm2(n,x);
   sync_barrier(&(tp->gate1));
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
   sync_barrier(&(tp->gate2));
@@ -336,9 +328,6 @@ cubic_interp(
 
 
   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(a-step)*pk */
-//  func(xp,x,m,n,adata);
-//  my_daxpy(n,xo,-1.0,x);
-//  p02=my_dnrm2(n,x);
   sync_barrier(&(tp->gate1));
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
   sync_barrier(&(tp->gate2));
@@ -348,16 +337,9 @@ cubic_interp(
   sync_barrier(&(tp->gate2));
 
 
-//  f0d=(p01*p01-p02*p02)/(2.0*step);
   f0d=(p01-p02)/(2.0*step);
 
-  //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(a-step)*pk */
-  //my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */
   my_daxpy(m,pk,-a+step+b,xp); /* xp<=xp+(b)*pk */
-//  func(xp,x,m,n,adata);
-//  my_daxpy(n,xo,-1.0,x);
-//  f1=my_dnrm2(n,x);
-//  f1*=f1;
   sync_barrier(&(tp->gate1));
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
   sync_barrier(&(tp->gate2));
@@ -369,9 +351,6 @@ cubic_interp(
 
   /* grad(phi_1): evaluate at -step and +step */
   my_daxpy(m,pk,step,xp); /* xp<=xp+(b+step)*pk */
-//  func(xp,x,m,n,adata);
-//  my_daxpy(n,xo,-1.0,x);
-//  p01=my_dnrm2(n,x);
   sync_barrier(&(tp->gate1));
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
   sync_barrier(&(tp->gate2));
@@ -382,9 +361,6 @@ cubic_interp(
 
 
   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(b-step)*pk */
-//  func(xp,x,m,n,adata);
-//  my_daxpy(n,xo,-1.0,x);
-//  p02=my_dnrm2(n,x);
   sync_barrier(&(tp->gate1));
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
   sync_barrier(&(tp->gate2));
@@ -394,7 +370,6 @@ cubic_interp(
   sync_barrier(&(tp->gate2));
 
 
-//  f1d=(p01*p01-p02*p02)/(2.0*step);
   f1d=(p01-p02)/(2.0*step);
 
 
@@ -417,13 +392,7 @@ cubic_interp(
     fz0=f0+f1;
    } else {
     /* evaluate function for this root */
-    //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(b-step)*pk */
-    //my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */
     my_daxpy(m,pk,-b+step+a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */
-//    func(xp,x,m,n,adata);
-//    my_daxpy(n,xo,-1.0,x);
-//    fz0=my_dnrm2(n,x);
-//    fz0*=fz0;
     sync_barrier(&(tp->gate1));
     tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
     sync_barrier(&(tp->gate2));
@@ -460,7 +429,7 @@ cubic_interp(
 
 /*************** Fletcher line search **********************************/
 /* zoom function for line search */
-/* func: vector function
+/* 
    xk: parameter values size m x 1 (at which step is calculated)
    pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
    a/b: bracket interval [a,b] (a>b) is possible
@@ -475,7 +444,6 @@ cubic_interp(
 */
 static double 
 linesearch_zoom(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
    double *xk, double *pk, double a, double b, double *x, double *xp,  double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata,  th_pipeline *tp, gbdata_b *tpg) {
 
   double alphaj,phi_j,phi_aj;
@@ -490,17 +458,12 @@ linesearch_zoom(
     /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */
     p01=aj+t2*(bj-aj);
     p02=bj-t3*(bj-aj);
-    alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata,tp,tpg);
+    alphaj=cubic_interp(xk,pk,p01,p02,x,xp,xo,m,n,step,adata,tp,tpg);
     //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj);
 
     /* evaluate phi(alphaj) */
     my_dcopy(m,xk,1,xp,1); /* xp<=xk */
     my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */
-//    func(xp,x,m,n,adata);
-    /* calculate x<=x-xo */
-//    my_daxpy(n,xo,-1.0,x);
-//    phi_j=my_dnrm2(n,x);
-//    phi_j*=phi_j;
     sync_barrier(&(tp->gate1));
     tpg->lmdata[0]->p=tpg->lmdata[1]->p=xp;
     tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
@@ -512,14 +475,7 @@ linesearch_zoom(
 
 
     /* evaluate phi(aj) */
-    //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+alphaj*pk */
-    //my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */
     my_daxpy(m,pk,-alphaj+aj,xp); /* xp<=xp+(aj)*pk */
-//    func(xp,x,m,n,adata);
-    /* calculate x<=x-xo */
-//    my_daxpy(n,xo,-1.0,x);
-//    phi_aj=my_dnrm2(n,x);
-//    phi_aj*=phi_aj;
     sync_barrier(&(tp->gate1));
     tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
     sync_barrier(&(tp->gate2));
@@ -533,13 +489,7 @@ linesearch_zoom(
       bj=alphaj; /* aj unchanged */
     } else {
      /* evaluate grad(alphaj) */
-     //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+aj*pk */
-     //my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
      my_daxpy(m,pk,-aj+alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
-//     func(xp,x,m,n,adata);
-     /* calculate x<=x-xo */
-//     my_daxpy(n,xo,-1.0,x);
-//     p01=my_dnrm2(n,x);
      sync_barrier(&(tp->gate1));
      tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
      sync_barrier(&(tp->gate2));
@@ -550,10 +500,6 @@ linesearch_zoom(
 
 
      my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */
-//     func(xp,x,m,n,adata);
-     /* calculate x<=x-xo */
-//     my_daxpy(n,xo,-1.0,x);
-//     p02=my_dnrm2(n,x);
      sync_barrier(&(tp->gate1));
      tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
      sync_barrier(&(tp->gate2));
@@ -563,7 +509,6 @@ linesearch_zoom(
      sync_barrier(&(tp->gate2));
 
 
-//     gphi_j=(p01*p01-p02*p02)/(2.0*step);
      gphi_j=(p01-p02)/(2.0*step);
 
      /* termination due to roundoff/other errors pp. 38, Fletcher */
@@ -601,7 +546,7 @@ linesearch_zoom(
  
 
 /* line search */
-/* func: vector function
+/* 
    xk: parameter values size m x 1 (at which step is calculated)
    pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
    alpha1: initial value for step
@@ -613,7 +558,6 @@ linesearch_zoom(
 */
 static double 
 linesearch(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
    double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata, th_pipeline *tp, gbdata_b *tpg) {
  /* phi(alpha)=f(xk+alpha pk)
   for vector function func 
@@ -642,11 +586,6 @@ linesearch(
 
   alphak=1.0;
   /* evaluate phi_0 and grad(phi_0) */
-//func(xk,x,m,n,adata);
-//my_daxpy(n,xo,-1.0,x);
-//phi_0=my_dnrm2(n,x);
-//phi_0*=phi_0;
-//printf("CPU cost=%lf\n",phi_0);
   sync_barrier(&(tp->gate1));
   tpg->lmdata[0]->p=tpg->lmdata[1]->p=xk;
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
@@ -663,10 +602,6 @@ linesearch(
   my_dcopy(m,xk,1,xp,1); /* xp<=xk */
   my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */
 
-//func(xp,x,m,n,adata);
-  /* calculate x<=x-xo */
-//my_daxpy(n,xo,-1.0,x);
-//p01=my_dnrm2(n,x);
   sync_barrier(&(tp->gate1));
   tpg->lmdata[0]->p=tpg->lmdata[1]->p=xp;
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
@@ -677,10 +612,6 @@ linesearch(
   sync_barrier(&(tp->gate2));
 
   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */
-//func(xp,x,m,n,adata);
-  /* calculate x<=x-xo */
-//my_daxpy(n,xo,-1.0,x);
-//p02=my_dnrm2(n,x);
   sync_barrier(&(tp->gate1));
   tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
   sync_barrier(&(tp->gate2));
@@ -690,7 +621,6 @@ linesearch(
   sync_barrier(&(tp->gate2));
 
 
-//  gphi_0=(p01*p01-p02*p02)/(2.0*step);
   gphi_0=(p01-p02)/(2.0*step);
 
 
@@ -719,11 +649,6 @@ linesearch(
    /* evalualte phi(alpha(i))=f(xk+alphai pk) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */
-//   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-//   my_daxpy(n,xo,-1.0,x);
-//   phi_alphai=my_dnrm2(n,x);
-//   phi_alphai*=phi_alphai;
    sync_barrier(&(tp->gate1));
    tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
    sync_barrier(&(tp->gate2));
@@ -743,7 +668,7 @@ linesearch(
 
    if ((phi_alphai>phi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) {
       /* ai=alphai1, bi=alphai bracket */
-      alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata,tp,tpg);
+      alphak=linesearch_zoom(xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata,tp,tpg);
 #ifdef DEBUG
       printf("Linesearch : Condition 1 met\n");
 #endif
@@ -751,13 +676,7 @@ linesearch(
    } 
 
    /* evaluate grad(phi(alpha(i))) */
-   //my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here because already xp=xk+alphai*pk */
-   //my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */
    my_daxpy(m,pk,step,xp); /* xp<=xp+(alphai+step)*pk */
-//   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-//   my_daxpy(n,xo,-1.0,x);
-//   p01=my_dnrm2(n,x);
    sync_barrier(&(tp->gate1));
    tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
    sync_barrier(&(tp->gate2));
@@ -767,10 +686,6 @@ linesearch(
    sync_barrier(&(tp->gate2));
 
    my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */
-//   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-//   my_daxpy(n,xo,-1.0,x);
-//   p02=my_dnrm2(n,x);
    sync_barrier(&(tp->gate1));
    tpg->status[0]=tpg->status[1]=PT_DO_CCOST;
    sync_barrier(&(tp->gate2));
@@ -780,7 +695,6 @@ linesearch(
    sync_barrier(&(tp->gate2));
 
 
-//   gphi_i=(p01*p01-p02*p02)/(2.0*step);
    gphi_i=(p01-p02)/(2.0*step);
 
    if (fabs(gphi_i)<=-sigma*gphi_0) {
@@ -793,7 +707,7 @@ linesearch(
 
    if (gphi_i>=0) {
      /* ai=alphai, bi=alphai1 bracket */
-     alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata,tp,tpg);
+     alphak=linesearch_zoom(xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata,tp,tpg);
 #ifdef DEBUG
      printf("Linesearch : Condition 3 met\n");
 #endif
@@ -809,7 +723,7 @@ linesearch(
      /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */
      p01=2.0*alphai-alphai1;
      p02=MIN(mu,alphai+t1*(alphai-alphai1));
-     alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata,tp,tpg);
+     alphai=cubic_interp(xk,pk,p01,p02,x,xp,xo,m,n,step,adata,tp,tpg);
      //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai);
    }
    phi_alphai1=phi_alphai;
@@ -832,7 +746,6 @@ linesearch(
 /* note M here  is LBFGS memory size */
 static int
 lbfgs_fit_common(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
    double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, int do_robust, void *adata) {
 
   double *gk; /* gradients at both k+1 and k iter */
@@ -987,14 +900,6 @@ lbfgs_fit_common(
   sync_barrier(&(tp.gate1));
   tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
   sync_barrier(&(tp.gate2));
-//  for (ci=0; ci<20; ci++) {
-//   printf("GPU %d %lf\n",ci,gk[ci]);
-//  } 
-  /*  gradient gk=grad(f)_k */
-//  func_grad(func,xk,gk,x,m,n,step,gpu_threads,adata);
-//  for (ci=0; ci<20; ci++) {
-//   printf("CPU %d %lf\n",ci,gk[ci]);
-//  } 
   
   double gradnrm=my_dnrm2(m,gk);
   /* if gradient is too small, no need to solve, so stop */
@@ -1025,7 +930,7 @@ lbfgs_fit_common(
    /* linesearch to find step length */
    /* parameters alpha1=10.0,sigma=0.1, rho=0.01, t1=9, t2=0.1, t3=0.5 */
    /* FIXME: update paramters for GPU gradient */
-   alphak=linesearch(func,xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata,&tp, &tpg);
+   alphak=linesearch(xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata,&tp, &tpg);
    /* check if step size is too small, or nan, then stop */
    if (!isnormal(alphak) || fabs(alphak)<CLM_EPSILON) {
     break;
@@ -1053,7 +958,6 @@ lbfgs_fit_common(
   tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
   sync_barrier(&(tp.gate2));
 
-//   func_grad(func,xk1,gk,x,m,n,step,gpu_threads,adata);
    /* yk=yk+gk1 */
    my_daxpy(m,gk,1.0,&y[cm]);
 
@@ -1107,16 +1011,15 @@ lbfgs_fit_common(
 
 
 
+/* wrapper functions */
 int
 lbfgs_fit(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
    double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {
-  return lbfgs_fit_common(func, p, x, m, n, itmax, M, gpu_threads, 0, adata);
+  return lbfgs_fit_common(p, x, m, n, itmax, M, gpu_threads, 0, adata);
 }
 
 int
 lbfgs_fit_robust_cuda(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
    double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {
-  return lbfgs_fit_common(func, p, x, m, n, itmax, M, gpu_threads, 1, adata);
+  return lbfgs_fit_common(p, x, m, n, itmax, M, gpu_threads, 1, adata);
 }
diff --git a/src/lib/Dirac/lbfgs_nocuda.c b/src/lib/Dirac/lbfgs_nocuda.c
index d0e986e..6087f08 100644
--- a/src/lib/Dirac/lbfgs_nocuda.c
+++ b/src/lib/Dirac/lbfgs_nocuda.c
@@ -21,282 +21,6 @@
 #include "Dirac.h"
 #include <pthread.h>
 
-
-/**** repeated code here ********************/
-/* Jones matrix multiplication 
-   C=A*B
-*/
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static void
-amb(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
- c[0]=a[0]*b[0]+a[1]*b[2];
- c[1]=a[0]*b[1]+a[1]*b[3];
- c[2]=a[2]*b[0]+a[3]*b[2];
- c[3]=a[2]*b[1]+a[3]*b[3];
-}
-
-/* Jones matrix multiplication 
-   C=A*B^H
-*/
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static void
-ambt(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
- c[0]=a[0]*conj(b[0])+a[1]*conj(b[1]);
- c[1]=a[0]*conj(b[2])+a[1]*conj(b[3]);
- c[2]=a[2]*conj(b[0])+a[3]*conj(b[1]);
- c[3]=a[2]*conj(b[2])+a[3]*conj(b[3]);
-}
-
-/**** end repeated code ********************/
-
-/* worker thread for a cpu */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static void *
-cpu_calc_deriv(void *adata) {
- thread_data_grad_t *t=(thread_data_grad_t*)adata;
-
- int ci,nb;
- int stc,stoff,stm,sta1,sta2;
- int N=t->N; /* stations */
- int M=t->M; /* clusters */
- int Nbase=(t->Nbase)*(t->tilesz);
-
-
- complex double xr[4]; /* residuals */
- complex double G1[4],G2[4],C[4],T1[4],T2[4];
- double pp[8];
- complex double csum;
- int cli,tpchunk,pstart,nchunk,tilesperchunk,stci,ttile,tptile,poff;
-
- /* iterate over each paramter */
- for (ci=t->g_start; ci<=t->g_end; ++ci) {
-    t->g[ci]=0.0;
-    /* find station and parameter corresponding to this value of ci */
-    /* this parameter should correspond to the right baseline (x tilesz)
-        to contribute to the derivative */
-    cli=0;
-    while((cli<M) && (ci<t->carr[cli].p[0] || ci>t->carr[cli].p[0]+8*N*t->carr[cli].nchunk-1)) {
-     cli++;
-    }
-   /* now either cli>=M: cluster not found 
-       or cli<M and cli is the right cluster */
-   if (cli==M && ci>=t->carr[cli-1].p[0] && ci<=t->carr[cli-1].p[0]+8*N*t->carr[cli-1].nchunk-1) {
-    cli--;
-   }
-
-   if (cli<M) {
-    /* right parameter offset */
-    stci=ci-t->carr[cli].p[0];
- 
-    stc=(stci%(8*N))/8; /* 0..N-1 */
-    /* make sure this baseline contribute to this parameter */
-    tpchunk=stci/(8*N);
-    nchunk=t->carr[cli].nchunk;
-    pstart=t->carr[cli].p[0];
-    tilesperchunk=(t->tilesz+nchunk-1)/nchunk;
-
-
-    /* iterate over all baselines and accumulate sum */
-    for (nb=0; nb<Nbase; ++nb) {
-     /* which tile is this ? */
-     ttile=nb/t->Nbase;
-     /* which chunk this tile belongs to */
-     tptile=ttile/tilesperchunk;
-     /* now tptile has to match tpchunk, otherwise ignore calculation */
-     if (tptile==tpchunk) {
-
-     sta1=t->barr[nb].sta1;
-     sta2=t->barr[nb].sta2;
-    
-     if (((stc==sta1)||(stc==sta2))&& !t->barr[nb].flag) {
-      /* this baseline has a contribution */
-      /* which paramter of this station */
-      stoff=(stci%(8*N))%8; /* 0..7 */
-      /* which cluster */
-      stm=cli; /* 0..M-1 */
-
-      /* exact expression for derivative 
-         2 real( vec^H(residual_this_baseline) 
-            * vec(-J_{pm}C_{pqm} J_{qm}^H)
-        where m: chosen cluster
-        J_{pm},J_{qm} Jones matrices for baseline p-q
-        depending on the parameter, J ==> E 
-        E: zero matrix, except 1 at location of m
-   
-       residual : in x[8*nb:8*nb+7]
-       C coh: in coh[8*M*nb+m*8:8*M*nb+m*8+7] (double storage)
-           coh[4*M*nb+4*m:4*M*nb+4*m+3] (complex storage)
-       J_p,J_q: in p[sta1*8+m*8*N: sta1*8+m*8*N+7]
-        and p[sta2*8+m*8*N: sta2*8+m*8*N+ 7]
-     */
-     /* read in residual vector, conjugated */
-     xr[0]=(t->x[nb*8])-_Complex_I*(t->x[nb*8+1]);
-     xr[1]=(t->x[nb*8+2])-_Complex_I*(t->x[nb*8+3]);
-     xr[2]=(t->x[nb*8+4])-_Complex_I*(t->x[nb*8+5]);
-     xr[3]=(t->x[nb*8+6])-_Complex_I*(t->x[nb*8+7]);
-
-     /* read in coherency */
-     C[0]=t->coh[4*M*nb+4*stm];
-     C[1]=t->coh[4*M*nb+4*stm+1];
-     C[2]=t->coh[4*M*nb+4*stm+2];
-     C[3]=t->coh[4*M*nb+4*stm+3];
-
-     memset(pp,0,sizeof(double)*8); 
-     if (stc==sta1) {
-       /* this station parameter gradient */
-       pp[stoff]=1.0;
-       memset(G1,0,sizeof(complex double)*4); 
-       G1[0]=pp[0]+_Complex_I*pp[1];
-       G1[1]=pp[2]+_Complex_I*pp[3];
-       G1[2]=pp[4]+_Complex_I*pp[5];
-       G1[3]=pp[6]+_Complex_I*pp[7];
-       poff=pstart+tpchunk*8*N+sta2*8;
-       G2[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
-       G2[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
-       G2[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+4]);
-       G2[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
-
-     } else if (stc==sta2) {
-       memset(G2,0,sizeof(complex double)*4); 
-       pp[stoff]=1.0;
-       G2[0]=pp[0]+_Complex_I*pp[1];
-       G2[1]=pp[2]+_Complex_I*pp[3];
-       G2[2]=pp[4]+_Complex_I*pp[5];
-       G2[3]=pp[6]+_Complex_I*pp[7];
-       poff=pstart+tpchunk*8*N+sta1*8;
-       G1[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
-       G1[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
-       G1[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+5]);
-       G1[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
-     }
-
-     /* T1=G1*C */
-     amb(G1,C,T1);
-     /* T2=T1*G2' */
-     ambt(T1,G2,T2);
-
-     /* calculate product xr*vec(J_p C J_q^H ) */
-     csum=xr[0]*T2[0];
-     csum+=xr[1]*T2[1];
-     csum+=xr[2]*T2[2];
-     csum+=xr[3]*T2[3];
-
-     /* accumulate sum */
-     t->g[ci]+=-2.0*creal(csum);
-     }
-     }
-    }
-   }
- }
-
-
- return NULL;
-}
-
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static int
-func_grad(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *p, double *g, double *xo, int m, int n, double step, void *adata) {
-  /* gradient for each parameter is
-     (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step
-    i=0,...,m-1 for all parameters
-    e_i: unit vector, 1 only at i-th location
-  */
-
-  double *x; /* array to store residual */
-  int ci;
-  me_data_t *dp=(me_data_t*)adata;
-
-  int Nt=dp->Nt;
-
-  pthread_attr_t attr;
-  pthread_t *th_array;
-  thread_data_grad_t *threaddata;
-
-
-  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
-  }
-  /* evaluate func once, store in x, and create threads */
-  /* and calculate the residual x=xo-func */
-  func(p,x,m,n,adata);
-  /* calculate x<=x-xo */
-  my_daxpy(n,xo,-1.0,x);
-
-  /* setup threads */
-  pthread_attr_init(&attr);
-  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
-
-  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
-#ifndef USE_MIC
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-#endif
-   exit(1);
-  }
-  if ((threaddata=(thread_data_grad_t*)malloc((size_t)Nt*sizeof(thread_data_grad_t)))==0) {
-#ifndef USE_MIC
-    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-#endif
-    exit(1);
-  }
-
-  int nth,nth1,Nparm;
-
-  /* parameters per thread */
-  Nparm=(m+Nt-1)/Nt;
-
-  /* each thread will calculate derivative of part of 
-     parameters */
-  ci=0;
-  for (nth=0;  nth<Nt; nth++) {
-   threaddata[nth].Nbase=dp->Nbase;
-   threaddata[nth].tilesz=dp->tilesz;
-   threaddata[nth].barr=dp->barr;
-   threaddata[nth].carr=dp->carr;
-   threaddata[nth].M=dp->M;
-   threaddata[nth].N=dp->N;
-   threaddata[nth].coh=dp->coh;
-   threaddata[nth].m=m;
-   threaddata[nth].n=n;
-   threaddata[nth].x=x;
-   threaddata[nth].p=p;
-   threaddata[nth].g=g;
-   threaddata[nth].g_start=ci;
-   threaddata[nth].g_end=ci+Nparm-1;
-   if (threaddata[nth].g_end>=m) {
-    threaddata[nth].g_end=m-1;
-   }
-   ci=ci+Nparm;
-   pthread_create(&th_array[nth],&attr,cpu_calc_deriv,(void*)(&threaddata[nth]));
-  }
-
-  /* now wait for threads to finish */
-  for(nth1=0; nth1<nth; nth1++) {
-   pthread_join(th_array[nth1],NULL);
-  }
-
-  pthread_attr_destroy(&attr);
-
-  free(th_array);
-  free(threaddata);
-
-  free(x);
-  return 0;
-}
-
-
 /* use algorithm 9.1 to compute pk=Hk gk */
 /* pk,gk: size m x 1
    s, y: size mM x 1 
@@ -381,24 +105,18 @@ mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, i
 
 /* cubic interpolation in interval [a,b] (a>b is possible)
    to find step that minimizes cost function */
-/* func: vector function
+/* func: scalar function
    xk: parameter values size m x 1 (at which step is calculated)
    pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
    a/b:  interval for interpolation
-   x: size n x 1 (storage)
    xp: size m x 1 (storage)
-   xo: observed data size n x 1
-   n: size of vector function
    step: step size for differencing 
    adata:  additional data passed to the function
 */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
 static double 
 cubic_interp(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *xk, double *pk, double a, double b, double *x, double *xp,  double *xo, int m, int n, double step, void *adata) {
+   double (*func)(double *p, int m, void *adata),
+   double *xk, double *pk, double a, double b, double *xp,  int m, double step, void *adata) {
 
   double f0,f1,f0d,f1d; /* function values and derivatives at a,b */
   double p01,p02,z0,fz0;
@@ -406,38 +124,24 @@ cubic_interp(
 
   my_dcopy(m,xk,1,xp,1); /* xp<=xk */
   my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */
-  func(xp,x,m,n,adata);
-  my_daxpy(n,xo,-1.0,x);
-  f0=my_dnrm2(n,x);
-  f0*=f0;
+  f0=func(xp,m,adata);
   /* grad(phi_0): evaluate at -step and +step */
   my_daxpy(m,pk,step,xp); /* xp<=xp+(a+step)*pk */
-  func(xp,x,m,n,adata);
-  my_daxpy(n,xo,-1.0,x);
-  p01=my_dnrm2(n,x);
+  p01=func(xp,m,adata);
   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(a-step)*pk */
-  func(xp,x,m,n,adata);
-  my_daxpy(n,xo,-1.0,x);
-  p02=my_dnrm2(n,x);
-  f0d=(p01*p01-p02*p02)/(2.0*step);
+  p02=func(xp,m,adata);
+  f0d=(p01-p02)/(2.0*step);
 
-  //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(a-step)*pk */
-  //my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */
+  //FIXME my_dcopy(m,xk,1,xp,1); /* not necessary because xp=xk+(a-step)*pk */
+  //FIXME my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */
   my_daxpy(m,pk,-a+step+b,xp); /* xp<=xp+(b)*pk */
-  func(xp,x,m,n,adata);
-  my_daxpy(n,xo,-1.0,x);
-  f1=my_dnrm2(n,x);
-  f1*=f1;
+  f1=func(xp,m,adata);
   /* grad(phi_1): evaluate at -step and +step */
   my_daxpy(m,pk,step,xp); /* xp<=xp+(b+step)*pk */
-  func(xp,x,m,n,adata);
-  my_daxpy(n,xo,-1.0,x);
-  p01=my_dnrm2(n,x);
+  p01=func(xp,m,adata);
   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(b-step)*pk */
-  func(xp,x,m,n,adata);
-  my_daxpy(n,xo,-1.0,x);
-  p02=my_dnrm2(n,x);
-  f1d=(p01*p01-p02*p02)/(2.0*step);
+  p02=func(xp,m,adata);
+  f1d=(p01-p02)/(2.0*step);
 
 
   //printf("Interp a,f(a),f'(a): (%lf,%lf,%lf) (%lf,%lf,%lf)\n",a,f0,f0d,b,f1,f1d);
@@ -459,13 +163,10 @@ cubic_interp(
     fz0=f0+f1;
    } else {
     /* evaluate function for this root */
-    //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(b-step)*pk */
-    //my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */
-    my_daxpy(m,pk,-b+step+a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */
-    func(xp,x,m,n,adata);
-    my_daxpy(n,xo,-1.0,x);
-    fz0=my_dnrm2(n,x);
-    fz0*=fz0;
+    //FIXME my_dcopy(m,xk,1,xp,1); /* not necessary because xp=xk+(b-step)*pk */
+    //my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(z0)*pk */
+    my_daxpy(m,pk,-b+step+a+z0*(b-a),xp); /* xp<=xp+(a+z0*(b-a))*pk */
+    fz0=func(xp,m,adata);
    }
 
    /* now choose between f0,f1,fz0,fz1 */
@@ -496,26 +197,21 @@ cubic_interp(
 
 /*************** Fletcher line search **********************************/
 /* zoom function for line search */
-/* func: vector function
+/* func: scalar function
    xk: parameter values size m x 1 (at which step is calculated)
    pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
    a/b: bracket interval [a,b] (a>b) is possible
-   x: size n x 1 (storage)
    xp: size m x 1 (storage)
    phi_0: phi(0)
    gphi_0: grad(phi(0))
-   xo: observed data size n x 1
-   n: size of vector function
+   sigma,rho,t1,t2,t3: line search parameters (from Fletcher) 
    step: step size for differencing 
    adata:  additional data passed to the function
 */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
 static double 
 linesearch_zoom(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *xk, double *pk, double a, double b, double *x, double *xp,  double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
+   double (*func)(double *p, int m, void *adata),
+   double *xk, double *pk, double a, double b, double *xp, double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, int m, double step, void *adata) {
 
   double alphaj,phi_j,phi_aj;
   double gphi_j,p01,p02,aj,bj;
@@ -529,46 +225,31 @@ linesearch_zoom(
     /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */
     p01=aj+t2*(bj-aj);
     p02=bj-t3*(bj-aj);
-    alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
+    alphaj=cubic_interp(func,xk,pk,p01,p02,xp,m,step,adata);
     //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj);
 
     /* evaluate phi(alphaj) */
     my_dcopy(m,xk,1,xp,1); /* xp<=xk */
     my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */
-    func(xp,x,m,n,adata);
-    /* calculate x<=x-xo */
-    my_daxpy(n,xo,-1.0,x);
-    phi_j=my_dnrm2(n,x);
-    phi_j*=phi_j;
+    phi_j=func(xp,m,adata);
 
     /* evaluate phi(aj) */
-    //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+alphaj*pk */
-    //my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */
+    //FIXME my_dcopy(m,xk,1,xp,1); /* xp<=xk : not necessary because already copied */
+    //FIXME my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */
     my_daxpy(m,pk,-alphaj+aj,xp); /* xp<=xp+(aj)*pk */
-    func(xp,x,m,n,adata);
-    /* calculate x<=x-xo */
-    my_daxpy(n,xo,-1.0,x);
-    phi_aj=my_dnrm2(n,x);
-    phi_aj*=phi_aj;
-
+    phi_aj=func(xp,m,adata);
 
     if ((phi_j>phi_0+rho*alphaj*gphi_0) || phi_j>=phi_aj) {
       bj=alphaj; /* aj unchanged */
     } else {
      /* evaluate grad(alphaj) */
-     //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+aj*pk */
-     //my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
+     //FIXME my_dcopy(m,xk,1,xp,1); /* xp<=xk */
+     //FIXME my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
      my_daxpy(m,pk,-aj+alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
-     func(xp,x,m,n,adata);
-     /* calculate x<=x-xo */
-     my_daxpy(n,xo,-1.0,x);
-     p01=my_dnrm2(n,x);
+     p01=func(xp,m,adata);
      my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */
-     func(xp,x,m,n,adata);
-     /* calculate x<=x-xo */
-     my_daxpy(n,xo,-1.0,x);
-     p02=my_dnrm2(n,x);
-     gphi_j=(p01*p01-p02*p02)/(2.0*step);
+     p02=func(xp,m,adata);
+     gphi_j=(p01-p02)/(2.0*step);
 
      /* termination due to roundoff/other errors pp. 38, Fletcher */
      if ((aj-alphaj)*gphi_j<=step) {
@@ -605,29 +286,25 @@ linesearch_zoom(
  
 
 /* line search */
-/* func: vector function
+/* func: scalar function
    xk: parameter values size m x 1 (at which step is calculated)
    pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
    alpha1: initial value for step
    sigma,rho,t1,t2,t3: line search parameters (from Fletcher) 
-   xo: observed data size n x 1
-   n: size of vector function
+   m: size or parameter vector
    step: step size for differencing 
    adata:  additional data passed to the function
 */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static double 
+double 
 linesearch(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
+   double (*func)(double *p, int m, void *adata),
+   double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, int m, double step, void *adata) {
  
  /* phi(alpha)=f(xk+alpha pk)
   for vector function func 
    f(xk) =||func(xk)||^2 */
   
-  double *x,*xp;
+  double *xp;
   double alphai,alphai1;
   double phi_0,phi_alphai,phi_alphai1;
   double p01,p02;
@@ -639,54 +316,36 @@ linesearch(
 
   int ci;
 
-  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
-  }
   if ((xp=(double*)calloc((size_t)m,sizeof(double)))==0) {
-#ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
      exit(1);
   }
 
   alphak=1.0;
   /* evaluate phi_0 and grad(phi_0) */
-  func(xk,x,m,n,adata);
-  my_daxpy(n,xo,-1.0,x);
-  phi_0=my_dnrm2(n,x);
-  phi_0*=phi_0;
+  phi_0=func(xk,m,adata);
   /* select tolarance 1/100 of current function value */
   tol=MIN(0.01*phi_0,1e-6);
 
   /* grad(phi_0): evaluate at -step and +step */
   my_dcopy(m,xk,1,xp,1); /* xp<=xk */
   my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */
-  func(xp,x,m,n,adata);
-  /* calculate x<=x-xo */
-  my_daxpy(n,xo,-1.0,x);
-  p01=my_dnrm2(n,x);
+  p01=func(xp,m,adata);
   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */
-  func(xp,x,m,n,adata);
-  /* calculate x<=x-xo */
-  my_daxpy(n,xo,-1.0,x);
-  p02=my_dnrm2(n,x);
-  gphi_0=(p01*p01-p02*p02)/(2.0*step);
+  p02=func(xp,m,adata);
+  gphi_0=(p01-p02)/(2.0*step);
 
   /* estimate for mu */
   /* mu = (tol-phi_0)/(rho gphi_0) */
   mu=(tol-phi_0)/(rho*gphi_0);
 #ifdef DEBUG
-  printf("mu=%lf, alpha1=%lf\n",mu,alpha1);
+  printf("deltaphi=%lf, mu=%lf, alpha1=%lf\n",gphi_0,mu,alpha1);
 #endif
   /* catch if not finite (deltaphi=0 or nan) */
   if (!isnormal(mu)) {
-    free(x);
     free(xp);
 #ifdef DEBUG
-    printf("line interval too small\n");
+  printf("line interval too small\n");
 #endif
     return mu;
   }
@@ -699,11 +358,7 @@ linesearch(
    /* evalualte phi(alpha(i))=f(xk+alphai pk) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */
-   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-   my_daxpy(n,xo,-1.0,x);
-   phi_alphai=my_dnrm2(n,x);
-   phi_alphai*=phi_alphai;
+   phi_alphai=func(xp,m,adata);
 
    if (phi_alphai<tol) {
      alphak=alphai;
@@ -715,7 +370,7 @@ linesearch(
 
    if ((phi_alphai>phi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) {
       /* ai=alphai1, bi=alphai bracket */
-      alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
+      alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,m,step,adata);
 #ifdef DEBUG
       printf("Linesearch : Condition 1 met\n");
 #endif
@@ -723,19 +378,14 @@ linesearch(
    } 
 
    /* evaluate grad(phi(alpha(i))) */
-   //my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here because already xp=xk+alphai*pk */
-   //my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */
+   //FIXME my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here?? xp<=xk */
+   //FIXME my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */
+   /* note that xp  already is xk+alphai. pk, so only add the missing term */
    my_daxpy(m,pk,step,xp); /* xp<=xp+(alphai+step)*pk */
-   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-   my_daxpy(n,xo,-1.0,x);
-   p01=my_dnrm2(n,x);
+   p01=func(xp,m,adata);
    my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */
-   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-   my_daxpy(n,xo,-1.0,x);
-   p02=my_dnrm2(n,x);
-   gphi_i=(p01*p01-p02*p02)/(2.0*step);
+   p02=func(xp,m,adata);
+   gphi_i=(p01-p02)/(2.0*step);
 
    if (fabs(gphi_i)<=-sigma*gphi_0) {
      alphak=alphai;
@@ -747,7 +397,7 @@ linesearch(
 
    if (gphi_i>=0) {
      /* ai=alphai, bi=alphai1 bracket */
-     alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
+     alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,m,step,adata);
 #ifdef DEBUG
      printf("Linesearch : Condition 3 met\n");
 #endif
@@ -763,7 +413,7 @@ linesearch(
      /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */
      p01=2.0*alphai-alphai1;
      p02=MIN(mu,alphai+t1*(alphai-alphai1));
-     alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
+     alphai=cubic_interp(func,xk,pk,p01,p02,xp,m,step,adata);
      //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai);
    }
    phi_alphai1=phi_alphai;
@@ -773,7 +423,6 @@ linesearch(
 
 
 
-  free(x);
   free(xp);
 #ifdef DEBUG
   printf("Step size=%g\n",alphak);
@@ -782,10 +431,26 @@ linesearch(
 }
 /*************** END Fletcher line search **********************************/
 
+
+
+/* cost function : return a scalar cost, input : p (mx1) parameters, m: no. of params, adata: additional data
+   grad function: return gradient (mx1): input : p (mx1) parameters, g (mx1) gradient vector, m: no. of params, adata: additional data
+*/ 
+/*
+   p: parameters m x 1 (used as initial value, output final value)
+   itmax: max iterations
+   M: BFGS memory size
+   adata: additional data
+*/
+
 int
 lbfgs_fit(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {
+   double (*cost_func)(double *p, int m, void *adata),
+   void (*grad_func)(double *p, double *g, int m, void *adata),
+   /* the following is the mini batch function, called per eath iteration
+      to change the input/output data used for the next iteration, if =NULL, full batch mode */
+   void (*change_batch)(int iter, void *adata), /* iter= iteration number */
+   double *p, int m, int itmax, int M, void *adata) {
 
   double *gk; /* gradients at both k+1 and k iter */
   double *xk1,*xk; /* parameters at k+1 and k iter */
@@ -799,56 +464,45 @@ lbfgs_fit(
   
 
   if ((gk=(double*)calloc((size_t)m,sizeof(double)))==0) {
-#ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
      exit(1);
   }
   if ((xk1=(double*)calloc((size_t)m,sizeof(double)))==0) {
-#ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
      exit(1);
   }
   if ((xk=(double*)calloc((size_t)m,sizeof(double)))==0) {
-#ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
      exit(1);
   }
 
   if ((pk=(double*)calloc((size_t)m,sizeof(double)))==0) {
-#ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
      exit(1);
   }
 
 
   /* storage size mM x 1*/
   if ((s=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
-#ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
      exit(1);
   }
   if ((y=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
-#ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
      exit(1);
   }
   if ((rho=(double*)calloc((size_t)M,sizeof(double)))==0) {
-#ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
      exit(1);
   }
 
   /* initial value for params xk=p */
   my_dcopy(m,p,1,xk,1);
+  if (change_batch) {
+   change_batch(0,adata);
+  }
   /*  gradient gk=grad(f)_k */
-  func_grad(func,xk,gk,x,m,n,step,adata);
+  grad_func(xk,gk,m,adata);
   double gradnrm=my_dnrm2(m,gk);
   /* if gradient is too small, no need to solve, so stop */
   if (gradnrm<CLM_STOP_THRESH) {
@@ -877,15 +531,14 @@ lbfgs_fit(
 
    /* linesearch to find step length */
    /* parameters alpha1=10.0,sigma=0.1, rho=0.01, t1=9, t2=0.1, t3=0.5 */
-   alphak=linesearch(func,xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata);
-   /* parameters c1=1e-4 c2=0.9, alpha1=1.0, alphamax=10.0, step (for alpha)=1e-4*/
-   //alphak=linesearch_nw(func,xk,pk,1.0,10.0,1e-4,0.9,x,m,n,1e-4,adata);
-   //alphak=1.0;
+   alphak=linesearch(cost_func,xk,pk,10.0,0.1,0.01,9,0.1,0.5,m,step,adata);
+   
+   /* Armijo line search */
+   ///alphak=linesearch_backtrack(cost_func,xk,pk,gk,m,1.0,adata);
    /* check if step size is too small, or nan, then stop */
    if (!isnormal(alphak) || fabs(alphak)<CLM_EPSILON) {
     break;
    }
-
    /* update parameters xk1=xk+alpha_k *pk */
    my_dcopy(m,xk,1,xk1,1);
    my_daxpy(m,pk,alphak,xk1);
@@ -900,7 +553,12 @@ lbfgs_fit(
    my_dscal(m,-1.0,&y[cm]);
 
    /* update gradient */
-   func_grad(func,xk1,gk,x,m,n,step,adata);
+   if (change_batch) {
+    change_batch(ck+1,adata);
+   }
+
+   grad_func(xk1,gk,m,adata);
+   gradnrm=my_dnrm2(m,gk);
    /* yk=yk+gk1 */
    my_daxpy(m,gk,1.0,&y[cm]);
 
@@ -920,15 +578,21 @@ lbfgs_fit(
    } else {
     cm=ci=0;
    }
+
+#ifdef DEBUG
+  printf("iter %d alpha=%g ||grad||=%g\n",ck,alphak,gradnrm);
+#endif
   }
 
 
  /* copy back solution to p */
  my_dcopy(m,xk,1,p,1);
 
- /* for (ci=0; ci<m; ci++) {
+#ifdef DEBUG
+  for (ci=0; ci<m; ci++) {
    printf("grad %d=%lf\n",ci,gk[ci]);
-  } */
+  } 
+#endif
 
   free(gk);
   free(xk1);
diff --git a/src/lib/Dirac/lmfit.c b/src/lib/Dirac/lmfit.c
index 1cfbc8e..89515ba 100644
--- a/src/lib/Dirac/lmfit.c
+++ b/src/lib/Dirac/lmfit.c
@@ -331,9 +331,9 @@ predict_threadfn_withgain_full(void *data) {
 
 /* minimization function (multithreaded) */
 /* p: size mx1 parameters
-   x: size nx1 data calculated
+   x: size nx1 model being calculated
    data: extra info needed */
-static void
+void
 minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {
 
   me_data_t *dp=(me_data_t*)data;
@@ -985,10 +985,10 @@ sagefit_visibilities_dual_pt(double *u, double *v, double *w, double *x, int N,
   /* use LBFGS */
    if (solver_mode==2) {
     lmdata0.robust_nu=robust_nu0;
-    lbfgs_fit_robust_cuda(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
+    lbfgs_fit_robust_cuda(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
     /* also print robust nu to output */
    } else {
-    lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
+    lbfgs_fit(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
    }
   } 
 
@@ -1380,9 +1380,9 @@ sagefit_visibilities_dual_pt_one_gpu(double *u, double *v, double *w, double *x,
    /* use LBFGS */
    if (solver_mode==2 || solver_mode==3) {
     lmdata.robust_nu=robust_nu0;
-    lbfgs_fit_robust_cuda(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    lbfgs_fit_robust_cuda(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
    } else {
-    lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    lbfgs_fit(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
    }
   }
   /* final residual calculation */
@@ -1457,9 +1457,9 @@ bfgsfit_visibilities_gpu(double *u, double *v, double *w, double *x, int N,
    /* use LBFGS */
    if (solver_mode==2 || solver_mode==3) {
     lmdata.robust_nu=mean_nu;
-    lbfgs_fit_robust_cuda(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    lbfgs_fit_robust_cuda(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
    } else {
-    lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    lbfgs_fit(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
    }
   }
   /* final residual calculation */
@@ -2145,13 +2145,13 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
   /* use LBFGS */
    if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
     lmdata0.robust_nu=robust_nu0;
-    lbfgs_fit_robust_cuda(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
+    lbfgs_fit_robust_cuda(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
    } else {
-    lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
+    lbfgs_fit(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
    }
   } 
 
-  /* final residual calculation */
+  /* final residual calculation: FIXME: possible GPU accel here? */
   minimize_viz_full_pth(p, xsub, m, n, (void*)&lmdata0);
   my_daxpy(n, xsub, -1.0, x);
 
diff --git a/src/lib/Dirac/lmfit_nocuda.c b/src/lib/Dirac/lmfit_nocuda.c
index 35fd301..9f617cc 100644
--- a/src/lib/Dirac/lmfit_nocuda.c
+++ b/src/lib/Dirac/lmfit_nocuda.c
@@ -688,7 +688,7 @@ predict_threadfn_withgain_full(void *data) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
-static void
+void
 minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {
 
   me_data_t *dp=(me_data_t*)data;
@@ -1023,9 +1023,9 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
   /* use LBFGS */
    if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS ||  solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
     lmdata.robust_nu=robust_nu0;
-    lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    lbfgs_fit_robust_wrapper(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
    } else {
-    lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    lbfgs_fit_wrapper(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
    }
 #ifdef USE_MIC
   lmdata.Nt=Nt; /* reset threads for MIC */
@@ -1181,9 +1181,9 @@ bfgsfit_visibilities(double *u, double *v, double *w, double *x, int N,
   /* use LBFGS */
    if (solver_mode==2 || solver_mode==3) {
     lmdata.robust_nu=mean_nu;
-    lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); 
+    lbfgs_fit_robust_wrapper(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); 
    } else {
-    lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
+    lbfgs_fit_wrapper(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
    }
 #ifdef USE_MIC
   lmdata.Nt=Nt; /* reset threads for MIC */
diff --git a/src/lib/Dirac/robust_lbfgs_nocuda.c b/src/lib/Dirac/robust_lbfgs_nocuda.c
index 53e61c7..570c2bb 100644
--- a/src/lib/Dirac/robust_lbfgs_nocuda.c
+++ b/src/lib/Dirac/robust_lbfgs_nocuda.c
@@ -54,6 +54,8 @@ ambt(complex double * __restrict a, complex double * __restrict b, complex doubl
 }
 
 /**** end repeated code ********************/
+
+/*************************************** ROBUST ***************************/
 /***************************************************************/
 /* worker thread to calculate
    sum ( log(1+ (y_i-f_i)^2/nu) )
@@ -78,7 +80,7 @@ func_robust_th(void *data) {
 /* recalculate log(1+ (y_i-f_i)^2/nu) 
    from function() that calculates f_i 
    y (data)
-   f=function()
+   f=model vector
    x=log(1+(y_i-f_i)^2/nu) output
    all size n x 1
    Nt: no of threads
@@ -144,518 +146,7 @@ func_robust(
 }
 /***************************************************************/
 
-/* use algorithm 9.1 to compute pk=Hk gk */
-/* pk,gk: size m x 1
-   s, y: size mM x 1 
-   rho: size M x 1 
-   ii: true location of the k th values in s,y */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static void
-mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, int M, int ii) {
- int ci;
- double *alphai;
- int *idx; /* store sorted locations of s, y here */
- double gamma,beta;
 
- if ((alphai=(double*)calloc((size_t)M,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
- }
- if ((idx=(int*)calloc((size_t)M,sizeof(int)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
- }
- if (M>0) {
-  /* find the location of k-1 th value */
-  if (ii>0) {
-   ii=ii-1;
-  } else {
-   ii=M-1;
-  }
- /* s,y will have 0,1,...,ii,ii+1,...M-1 */
- /* map this to  ii+1,ii+2,...,M-1,0,1,..,ii */
-  for (ci=0; ci<M-ii-1; ci++){
-   idx[ci]=(ii+ci+1);
-  }
-  for(ci=M-ii-1; ci<M; ci++) {
-   idx[ci]=(ci-M+ii+1);
-  }
- }
-
-#ifdef DEBUG
- printf("prod M=%d, current ii=%d\n",M,ii);
- for(ci=0; ci<M; ci++) {
-  printf("%d->%d ",ci,idx[ci]);
- }
- printf("\n");
-#endif
- /* q = grad(f)k : pk<=gk */
- my_dcopy(m,gk,1,pk,1);
- /* this should be done in the right order */
- for (ci=0; ci<M; ci++) {
-  /* alphai=rhoi si^T*q */
-  alphai[M-ci-1]=rho[idx[M-ci-1]]*my_ddot(m,&s[m*idx[M-ci-1]],pk);
-  /* q=q-alphai yi */
-  my_daxpy(m,&y[m*idx[M-ci-1]],-alphai[M-ci-1],pk);
- }
- /* r=Hk(0) q : initial hessian */
- /* gamma=s(k-1)^T*y(k-1)/y(k-1)^T*y(k-1)*/
- gamma=1.0;
- if (M>0) {
-  gamma=my_ddot(m,&s[m*idx[M-1]],&y[m*idx[M-1]]);
-  gamma/=my_ddot(m,&y[m*idx[M-1]],&y[m*idx[M-1]]);
-  /* Hk(0)=gamma I, so scale q by gamma */
-  /* r= Hk(0) q */
-  my_dscal(m,gamma,pk);
- } 
-
- for (ci=0; ci<M; ci++) {
-  /* beta=rhoi yi^T * r */
-  beta=rho[idx[ci]]*my_ddot(m,&y[m*idx[ci]],pk);
-  /* r = r + (alphai-beta)*si */
-  my_daxpy(m,&s[m*idx[ci]],alphai[ci]-beta,pk);
- }
-
- free(alphai);
- free(idx);
-}
-
-/* cubic interpolation in interval [a,b] (a>b is possible)
-   to find step that minimizes cost function */
-/* func: vector function
-   xk: parameter values size m x 1 (at which step is calculated)
-   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
-   a/b:  interval for interpolation
-   x: size n x 1 (storage)
-   xp: size m x 1 (storage)
-   xo: observed data size n x 1
-   n: size of vector function
-   step: step size for differencing 
-   adata:  additional data passed to the function
-*/
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static double 
-cubic_interp(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *xk, double *pk, double a, double b, double *x, double *xp,  double *xo, int m, int n, double step, void *adata) {
-
-  me_data_t *dp=(me_data_t*)adata;
-  double f0,f1,f0d,f1d; /* function values and derivatives at a,b */
-  double p01,p02,z0,fz0;
-  double aa,cc;
-
-  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
-  my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */
-  func(xp,x,m,n,adata);
-  //my_daxpy(n,xo,-1.0,x);
-  //f0=my_dnrm2(n,x);
-  //f0*=f0;
-  f0=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  /* grad(phi_0): evaluate at -step and +step */
-  my_daxpy(m,pk,step,xp); /* xp<=xp+(a+step)*pk */
-  func(xp,x,m,n,adata);
-  //my_daxpy(n,xo,-1.0,x);
-  //p01=my_dnrm2(n,x);
-  p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(a-step)*pk */
-  func(xp,x,m,n,adata);
-  //my_daxpy(n,xo,-1.0,x);
-  //p02=my_dnrm2(n,x);
-  p02=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  f0d=(p01-p02)/(2.0*step);
-
-  //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(a-step)*pk */
-  //my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */
-  my_daxpy(m,pk,-a+step+b,xp); /* xp<=xp+(b)*pk */
-  func(xp,x,m,n,adata);
-  //my_daxpy(n,xo,-1.0,x);
-  //f1=my_dnrm2(n,x);
-  //f1*=f1;
-  f1=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  /* grad(phi_1): evaluate at -step and +step */
-  my_daxpy(m,pk,step,xp); /* xp<=xp+(b+step)*pk */
-  func(xp,x,m,n,adata);
-  //my_daxpy(n,xo,-1.0,x);
-  //p01=my_dnrm2(n,x);
-  p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(b-step)*pk */
-  func(xp,x,m,n,adata);
-  //my_daxpy(n,xo,-1.0,x);
-  //p02=my_dnrm2(n,x);
-  p02=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  f1d=(p01-p02)/(2.0*step);
-
-
-  //printf("Interp a,f(a),f'(a): (%lf,%lf,%lf) (%lf,%lf,%lf)\n",a,f0,f0d,b,f1,f1d);
-  /* cubic poly in [0,1] is f0+f0d z+eta z^2+xi z^3 
-    where eta=3(f1-f0)-2f0d-f1d, xi=f0d+f1d-2(f1-f0) 
-    derivative f0d+2 eta z+3 xi z^2 => cc+bb z+aa z^2 */
-   aa=3.0*(f0-f1)/(b-a)+(f1d-f0d);
-   p01=aa*aa-f0d*f1d;
-  /* root exist? */
-  if (p01>0.0) {
-   /* root */
-   cc=sqrt(p01);
-   z0=b-(f1d+cc-aa)*(b-a)/(f1d-f0d+2.0*cc);
-   /* FIXME: check if this is within boundary */
-   aa=MAX(a,b);
-   cc=MIN(a,b);
-   //printf("Root=%lf, in [%lf,%lf]\n",z0,cc,aa);
-   if (z0>aa || z0<cc) {
-    fz0=f0+f1;
-   } else {
-    /* evaluate function for this root */
-    //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(b-step)*pk */
-    //my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */
-    my_daxpy(m,pk,-b+step+a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */
-    func(xp,x,m,n,adata);
-    //my_daxpy(n,xo,-1.0,x);
-    //fz0=my_dnrm2(n,x);
-    //fz0*=fz0;
-    fz0=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-   }
-
-   /* now choose between f0,f1,fz0,fz1 */
-   if (f0<f1 && f0<fz0) {
-     return a;
-   }
-   if (f1<fz0) {
-     return b;
-   }
-   /* else */
-   return (z0);
-  } else { 
-
-   /* find the value from a or b that minimizes func */
-   if (f0<f1) {
-    return a;
-   } else {
-    return b;
-   }
-  }
-
-  return 0;
-}
-
-
-/*************** Fletcher line search **********************************/
-/* zoom function for line search */
-/* func: vector function
-   xk: parameter values size m x 1 (at which step is calculated)
-   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
-   a/b: bracket interval [a,b] (a>b) is possible
-   x: size n x 1 (storage)
-   xp: size m x 1 (storage)
-   phi_0: phi(0)
-   gphi_0: grad(phi(0))
-   xo: observed data size n x 1
-   n: size of vector function
-   step: step size for differencing 
-   adata:  additional data passed to the function
-*/
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static double 
-linesearch_zoom(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *xk, double *pk, double a, double b, double *x, double *xp,  double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
-
-  me_data_t *dp=(me_data_t*)adata;
-  double alphaj,phi_j,phi_aj;
-  double gphi_j,p01,p02,aj,bj;
-  double alphak=1.0;
-  int ci,found_step=0;
-
-  aj=a;
-  bj=b;
-  ci=0;
-  while(ci<10) {
-    /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */
-    p01=aj+t2*(bj-aj);
-    p02=bj-t3*(bj-aj);
-    alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
-    //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj);
-
-    /* evaluate phi(alphaj) */
-    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
-    my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */
-    func(xp,x,m,n,adata);
-    /* calculate x<=x-xo */
-    //my_daxpy(n,xo,-1.0,x);
-    //phi_j=my_dnrm2(n,x);
-    //phi_j*=phi_j;
-    phi_j=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-
-    /* evaluate phi(aj) */
-    //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+alphaj*pk */
-    //my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */
-    my_daxpy(m,pk,-alphaj+aj,xp); /* xp<=xp+(aj)*pk */
-    func(xp,x,m,n,adata);
-    /* calculate x<=x-xo */
-    //my_daxpy(n,xo,-1.0,x);
-    //phi_aj=my_dnrm2(n,x);
-    //phi_aj*=phi_aj;
-    phi_aj=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-
-#ifdef DEBUG
-    printf("phi_j=%lf, phi_aj=%lf\n",phi_j,phi_aj);
-#endif
-    if ((phi_j>phi_0+rho*alphaj*gphi_0) || phi_j>=phi_aj) {
-      bj=alphaj; /* aj unchanged */
-    } else {
-     /* evaluate grad(alphaj) */
-     //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+aj*pk */
-     //my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
-     my_daxpy(m,pk,-aj+alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
-     func(xp,x,m,n,adata);
-     /* calculate x<=x-xo */
-     //my_daxpy(n,xo,-1.0,x);
-     //p01=my_dnrm2(n,x);
-     p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-
-     my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */
-     func(xp,x,m,n,adata);
-     /* calculate x<=x-xo */
-     //my_daxpy(n,xo,-1.0,x);
-     //p02=my_dnrm2(n,x);
-     p02=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-     gphi_j=(p01-p02)/(2.0*step);
-#ifdef DEBUG
-    printf("p01=%lf, p02=%lf\n",p01,p02);
-#endif
-
-     /* termination due to roundoff/other errors pp. 38, Fletcher */
-     if ((aj-alphaj)*gphi_j<=step) {
-      alphak=alphaj;
-      found_step=1;
-      break;
-     }
-    
-     if (fabs(gphi_j)<=-sigma*gphi_0) {
-      alphak=alphaj;
-      found_step=1;
-      break;
-     }
-     
-     if (gphi_j*(bj-aj)>=0) {
-       bj=aj;
-     } /* else bj unchanged */
-     aj=alphaj;
-   }
-   ci++;
-  }
-
-  if (!found_step) {
-   /* use bound to find possible step */
-   alphak=alphaj;
-  }
-   
-#ifdef DEBUG
-  printf("Found %lf Interval [%lf,%lf]\n",alphak,a,b);
-#endif
-  return alphak;
-}
- 
- 
-
-/* line search */
-/* func: vector function
-   xk: parameter values size m x 1 (at which step is calculated)
-   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
-   alpha1: initial value for step
-   sigma,rho,t1,t2,t3: line search parameters (from Fletcher) 
-   xo: observed data size n x 1
-   n: size of vector function
-   step: step size for differencing 
-   adata:  additional data passed to the function
-*/
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static double 
-linesearch(
-   void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
- 
- /* phi(alpha)=f(xk+alpha pk)
-  for vector function func 
-   f(xk) =||func(xk)||^2 */
-  
-  me_data_t *dp=(me_data_t*)adata;
-  double *x,*xp;
-  double alphai,alphai1;
-  double phi_0,phi_alphai,phi_alphai1;
-  double p01,p02;
-  double gphi_0,gphi_i;
-  double alphak;
-
-  double mu;
-  double tol; /* lower limit for minimization */
-
-  int ci;
-
-  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
-  }
-  if ((xp=(double*)calloc((size_t)m,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
-  }
-
-  alphak=1.0;
-  /* evaluate phi_0 and grad(phi_0) */
-  func(xk,x,m,n,adata);
-  //my_daxpy(n,xo,-1.0,x);
-  //phi_0=my_dnrm2(n,x);
-  //phi_0*=phi_0;
-  phi_0=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  /* select tolarance 1/100 of current function value */
-  tol=MIN(0.01*phi_0,1e-6);
-
-
-  /* grad(phi_0): evaluate at -step and +step */
-  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
-  my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */
-  func(xp,x,m,n,adata);
-  /* calculate x<=x-xo */
-  //my_daxpy(n,xo,-1.0,x);
-  //p01=my_dnrm2(n,x);
-  p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */
-  func(xp,x,m,n,adata);
-  /* calculate x<=x-xo */
-  //my_daxpy(n,xo,-1.0,x);
-  //p02=my_dnrm2(n,x);
-  p02=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-  gphi_0=(p01-p02)/(2.0*step);
-
-
-  /* estimate for mu */
-  /* mu = (tol-phi_0)/(rho gphi_0) */
-  mu=(tol-phi_0)/(rho*gphi_0);
-#ifdef DEBUG
-  printf("cost=%lf grad=%lf mu=%lf, alpha1=%lf\n",phi_0,gphi_0,mu,alpha1);
-#endif
-  /* catch if not finite (deltaphi=0 or nan) */
-  if (!isnormal(mu)) {
-    free(x);
-    free(xp);
-#ifdef DEBUG
-    printf("line interval too small\n");
-#endif
-    return mu;
-  }
-
-
-  ci=1;
-  alphai=alpha1; /* initial value for alpha(i) : check if 0<alphai<=mu */
-  alphai1=0.0;
-  phi_alphai1=phi_0;
-  while(ci<10) {
-   /* evalualte phi(alpha(i))=f(xk+alphai pk) */
-   my_dcopy(m,xk,1,xp,1); /* xp<=xk */
-   my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */
-   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-   //my_daxpy(n,xo,-1.0,x);
-   //phi_alphai=my_dnrm2(n,x);
-   //phi_alphai*=phi_alphai;
-   phi_alphai=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-
-   if (phi_alphai<tol) {
-     alphak=alphai;
-#ifdef DEBUG
-     printf("Linesearch : Condition 0 met\n");
-#endif
-     break;
-   }
-
-   if ((phi_alphai>phi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) {
-      /* ai=alphai1, bi=alphai bracket */
-      alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
-#ifdef DEBUG
-      printf("Linesearch : Condition 1 met\n");
-#endif
-      break;
-   } 
-
-   /* evaluate grad(phi(alpha(i))) */
-   //my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here because already xp=xk+alphai*pk */
-   //my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */
-   my_daxpy(m,pk,step,xp); /* xp<=xp+(alphai+step)*pk */
-   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-   //my_daxpy(n,xo,-1.0,x);
-   //p01=my_dnrm2(n,x);
-   p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-   my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */
-   func(xp,x,m,n,adata);
-   /* calculate x<=x-xo */
-   //my_daxpy(n,xo,-1.0,x);
-   //p02=my_dnrm2(n,x);
-   p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt);
-   gphi_i=(p01-p02)/(2.0*step);
-
-   if (fabs(gphi_i)<=-sigma*gphi_0) {
-     alphak=alphai;
-#ifdef DEBUG
-     printf("Linesearch : Condition 2 met\n");
-#endif
-     break;
-   }
-
-   if (gphi_i>=0) {
-     /* ai=alphai, bi=alphai1 bracket */
-     alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
-#ifdef DEBUG
-     printf("Linesearch : Condition 3 met\n");
-#endif
-     break;
-   }
-
-   /* else preserve old values */
-   if (mu<=(2*alphai-alphai1)) {
-     /* next step */
-     alphai1=alphai;
-     alphai=mu;
-   } else {
-     /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */
-     p01=2*alphai-alphai1;
-     p02=MIN(mu,alphai+t1*(alphai-alphai1));
-     alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
-     //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai);
-   }
-   phi_alphai1=phi_alphai;
-
-   ci++;
-  }
-
-
-
-  free(x);
-  free(xp);
-#ifdef DEBUG
-  printf("Step size=%lf\n",alphak);
-#endif
-  return alphak;
-}
-/*************** END Fletcher line search **********************************/
-
-/*************************************** ROBUST ***************************/
 /* worker thread for a cpu */
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
@@ -816,13 +307,12 @@ cpu_calc_deriv_robust(void *adata) {
  return NULL;
 }
 /* calculate gradient */
-/* func: vector function
+/* func: vector function to predict model
    p: parameter values size m x 1 (at which grad is calculated)
    g: gradient size m x 1 
    xo: observed data size n x 1
    robust_nu: nu in T distribution
    n: size of vector function
-   step: step size for differencing 
    adata:  additional data passed to the function
 */
 #ifdef USE_MIC
@@ -831,8 +321,8 @@ __attribute__ ((target(MIC)))
 static int
 func_grad_robust(
    void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *p, double *g, double *xo, int m, int n, double step, void *adata) {
-  /* gradient for each parameter is
+   double *p, double *g, double *xo, int m, int n, void *adata) {
+  /* numerical gradient for each parameter is
      (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step
     i=0,...,m-1 for all parameters
     e_i: unit vector, 1 only at i-th location
@@ -922,162 +412,352 @@ func_grad_robust(
   free(x);
   return 0;
 }
+/*************************************** END ROBUST ***************************/
 
-int
-lbfgs_fit_robust(
+
+
+/* worker thread for a cpu */
+#ifdef USE_MIC
+__attribute__ ((target(MIC)))
+#endif
+static void *
+cpu_calc_deriv(void *adata) {
+ thread_data_grad_t *t=(thread_data_grad_t*)adata;
+
+ int ci,nb;
+ int stc,stoff,stm,sta1,sta2;
+ int N=t->N; /* stations */
+ int M=t->M; /* clusters */
+ int Nbase=(t->Nbase)*(t->tilesz);
+
+
+ complex double xr[4]; /* residuals */
+ complex double G1[4],G2[4],C[4],T1[4],T2[4];
+ double pp[8];
+ complex double csum;
+ int cli,tpchunk,pstart,nchunk,tilesperchunk,stci,ttile,tptile,poff;
+
+ /* iterate over each paramter */
+ for (ci=t->g_start; ci<=t->g_end; ++ci) {
+    t->g[ci]=0.0;
+    /* find station and parameter corresponding to this value of ci */
+    /* this parameter should correspond to the right baseline (x tilesz)
+        to contribute to the derivative */
+    cli=0;
+    while((cli<M) && (ci<t->carr[cli].p[0] || ci>t->carr[cli].p[0]+8*N*t->carr[cli].nchunk-1)) {
+     cli++;
+    }
+   /* now either cli>=M: cluster not found 
+       or cli<M and cli is the right cluster */
+   if (cli==M && ci>=t->carr[cli-1].p[0] && ci<=t->carr[cli-1].p[0]+8*N*t->carr[cli-1].nchunk-1) {
+    cli--;
+   }
+
+   if (cli<M) {
+    /* right parameter offset */
+    stci=ci-t->carr[cli].p[0];
+ 
+    stc=(stci%(8*N))/8; /* 0..N-1 */
+    /* make sure this baseline contribute to this parameter */
+    tpchunk=stci/(8*N);
+    nchunk=t->carr[cli].nchunk;
+    pstart=t->carr[cli].p[0];
+    tilesperchunk=(t->tilesz+nchunk-1)/nchunk;
+
+
+    /* iterate over all baselines and accumulate sum */
+    for (nb=0; nb<Nbase; ++nb) {
+     /* which tile is this ? */
+     ttile=nb/t->Nbase;
+     /* which chunk this tile belongs to */
+     tptile=ttile/tilesperchunk;
+     /* now tptile has to match tpchunk, otherwise ignore calculation */
+     if (tptile==tpchunk) {
+
+     sta1=t->barr[nb].sta1;
+     sta2=t->barr[nb].sta2;
+    
+     if (((stc==sta1)||(stc==sta2))&& !t->barr[nb].flag) {
+      /* this baseline has a contribution */
+      /* which paramter of this station */
+      stoff=(stci%(8*N))%8; /* 0..7 */
+      /* which cluster */
+      stm=cli; /* 0..M-1 */
+
+      /* exact expression for derivative 
+         2 real( vec^H(residual_this_baseline) 
+            * vec(-J_{pm}C_{pqm} J_{qm}^H)
+        where m: chosen cluster
+        J_{pm},J_{qm} Jones matrices for baseline p-q
+        depending on the parameter, J ==> E 
+        E: zero matrix, except 1 at location of m
+   
+       residual : in x[8*nb:8*nb+7]
+       C coh: in coh[8*M*nb+m*8:8*M*nb+m*8+7] (double storage)
+           coh[4*M*nb+4*m:4*M*nb+4*m+3] (complex storage)
+       J_p,J_q: in p[sta1*8+m*8*N: sta1*8+m*8*N+7]
+        and p[sta2*8+m*8*N: sta2*8+m*8*N+ 7]
+     */
+     /* read in residual vector, conjugated */
+     xr[0]=(t->x[nb*8])-_Complex_I*(t->x[nb*8+1]);
+     xr[1]=(t->x[nb*8+2])-_Complex_I*(t->x[nb*8+3]);
+     xr[2]=(t->x[nb*8+4])-_Complex_I*(t->x[nb*8+5]);
+     xr[3]=(t->x[nb*8+6])-_Complex_I*(t->x[nb*8+7]);
+
+     /* read in coherency */
+     C[0]=t->coh[4*M*nb+4*stm];
+     C[1]=t->coh[4*M*nb+4*stm+1];
+     C[2]=t->coh[4*M*nb+4*stm+2];
+     C[3]=t->coh[4*M*nb+4*stm+3];
+
+     memset(pp,0,sizeof(double)*8); 
+     if (stc==sta1) {
+       /* this station parameter gradient */
+       pp[stoff]=1.0;
+       memset(G1,0,sizeof(complex double)*4); 
+       G1[0]=pp[0]+_Complex_I*pp[1];
+       G1[1]=pp[2]+_Complex_I*pp[3];
+       G1[2]=pp[4]+_Complex_I*pp[5];
+       G1[3]=pp[6]+_Complex_I*pp[7];
+       poff=pstart+tpchunk*8*N+sta2*8;
+       G2[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
+       G2[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
+       G2[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+4]);
+       G2[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
+
+     } else if (stc==sta2) {
+       memset(G2,0,sizeof(complex double)*4); 
+       pp[stoff]=1.0;
+       G2[0]=pp[0]+_Complex_I*pp[1];
+       G2[1]=pp[2]+_Complex_I*pp[3];
+       G2[2]=pp[4]+_Complex_I*pp[5];
+       G2[3]=pp[6]+_Complex_I*pp[7];
+       poff=pstart+tpchunk*8*N+sta1*8;
+       G1[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
+       G1[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
+       G1[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+5]);
+       G1[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
+     }
+
+     /* T1=G1*C */
+     amb(G1,C,T1);
+     /* T2=T1*G2' */
+     ambt(T1,G2,T2);
+
+     /* calculate product xr*vec(J_p C J_q^H ) */
+     csum=xr[0]*T2[0];
+     csum+=xr[1]*T2[1];
+     csum+=xr[2]*T2[2];
+     csum+=xr[3]*T2[3];
+
+     /* accumulate sum */
+     t->g[ci]+=-2.0*creal(csum);
+     }
+     }
+    }
+   }
+ }
+
+
+ return NULL;
+}
+
+#ifdef USE_MIC
+__attribute__ ((target(MIC)))
+#endif
+static int
+func_grad(
    void (*func)(double *p, double *hx, int m, int n, void *adata),
-   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads,
- void *adata) {
+   double *p, double *g, double *xo, int m, int n,void *adata) {
+  /* numerical gradient for each parameter is
+     (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step
+    i=0,...,m-1 for all parameters
+    e_i: unit vector, 1 only at i-th location
+  */
 
-  double *gk; /* gradients at both k+1 and k iter */
-  double *xk1,*xk; /* parameters at k+1 and k iter */
-  double *pk; /* step direction H_k * grad(f) */
+  double *x; /* array to store residual */
+  int ci;
+  me_data_t *dp=(me_data_t*)adata;
 
-  double step=1e-6;
-  double *y, *s; /* storage for delta(grad) and delta(p) */
-  double *rho; /* storage for 1/yk^T*sk */
-  int ci,ck,cm;
-  double alphak=1.0;
-  
+  int Nt=dp->Nt;
 
-  if ((gk=(double*)calloc((size_t)m,sizeof(double)))==0) {
+  pthread_attr_t attr;
+  pthread_t *th_array;
+  thread_data_grad_t *threaddata;
+
+
+  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
 #ifndef USE_MIC
      fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
 #endif
      exit(1);
   }
-  if ((xk1=(double*)calloc((size_t)m,sizeof(double)))==0) {
+  /* evaluate func once, store in x, and create threads */
+  /* and calculate the residual x=xo-func */
+  func(p,x,m,n,adata);
+  /* calculate x<=x-xo */
+  my_daxpy(n,xo,-1.0,x);
+
+  /* setup threads */
+  pthread_attr_init(&attr);
+  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
+
+  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
 #ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
-     exit(1);
+   exit(1);
   }
-  if ((xk=(double*)calloc((size_t)m,sizeof(double)))==0) {
+  if ((threaddata=(thread_data_grad_t*)malloc((size_t)Nt*sizeof(thread_data_grad_t)))==0) {
 #ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
 #endif
-     exit(1);
+    exit(1);
   }
 
-  if ((pk=(double*)calloc((size_t)m,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
-  }
+  int nth,nth1,Nparm;
 
+  /* parameters per thread */
+  Nparm=(m+Nt-1)/Nt;
 
-  /* storage size mM x 1*/
-  if ((s=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
-  }
-  if ((y=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
-  }
-  if ((rho=(double*)calloc((size_t)M,sizeof(double)))==0) {
-#ifndef USE_MIC
-     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-#endif
-     exit(1);
-  }
-
-  /* initial value for params xk=p */
-  my_dcopy(m,p,1,xk,1);
-  /*  gradient gk=grad(f)_k */
-  func_grad_robust(func,xk,gk,x,m,n,step,adata);
-  double gradnrm=my_dnrm2(m,gk);
-  /* if gradient is too small, no need to solve, so stop */
-  if (gradnrm<CLM_STOP_THRESH) {
-   ck=itmax;
-   step=0.0;
-  } else {
-   ck=0;
-   /* step in [1e-6,1e-9] */
-   step=MAX(1e-9,MIN(1e-3/gradnrm,1e-6));
-  }
-#ifdef DEBUG
-  printf("||grad||=%g step=%g\n",gradnrm,step);
-#endif
-
-  cm=0;
+  /* each thread will calculate derivative of part of 
+     parameters */
   ci=0;
-  
-  while (ck<itmax && isnormal(gradnrm) && gradnrm>CLM_STOP_THRESH) {
-   /* mult with hessian  pk=-H_k*gk */
-   if (ck<M) {
-    mult_hessian(m,pk,gk,s,y,rho,ck,ci);
-   } else {
-    mult_hessian(m,pk,gk,s,y,rho,M,ci);
-   }
-   my_dscal(m,-1.0,pk);
-
-   /* linesearch to find step length */
-   /* parameters alpha1=10.0,sigma=0.1, rho=0.01, t1=9, t2=0.1, t3=0.5 */
-   alphak=linesearch(func,xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata);
-   /* parameters c1=1e-4 c2=0.9, alpha1=1.0, alphamax=10.0, step (for alpha)=1e-4*/
-   //alphak=linesearch_nw(func_robust,xk,pk,1.0,10.0,1e-4,0.9,x,m,n,1e-4,adata);
-   //alphak=1.0;
-   /* check if step size is too small, or nan, then stop */
-   if (!isnormal(alphak) || fabs(alphak)<CLM_EPSILON) {
-    break;
-   }
-
-   /* update parameters xk1=xk+alpha_k *pk */
-   my_dcopy(m,xk,1,xk1,1);
-   my_daxpy(m,pk,alphak,xk1);
-  
-   /* calculate sk=xk1-xk and yk=gk1-gk */
-   /* sk=xk1 */ 
-   my_dcopy(m,xk1,1,&s[cm],1); 
-   /* sk=sk-xk */
-   my_daxpy(m,xk,-1.0,&s[cm]);
-   /* yk=-gk */ 
-   my_dcopy(m,gk,1,&y[cm],1); 
-   my_dscal(m,-1.0,&y[cm]);
-
-   /* update gradient */
-   func_grad_robust(func,xk1,gk,x,m,n,step,adata);
-   /* yk=yk+gk1 */
-   my_daxpy(m,gk,1.0,&y[cm]);
-
-   /* calculate 1/yk^T*sk */
-   rho[ci]=1.0/my_ddot(m,&y[cm],&s[cm]);
-
-   /* update xk=xk1 */
-   my_dcopy(m,xk1,1,xk,1); 
-  
-   //printf("iter %d store %d\n",ck,cm);
-   ck++;
-   /* increment storage appropriately */
-   if (cm<(M-1)*m) {
-    /* offset of m */
-    cm=cm+m;
-    ci++;
-   } else {
-    cm=ci=0;
+  for (nth=0;  nth<Nt; nth++) {
+   threaddata[nth].Nbase=dp->Nbase;
+   threaddata[nth].tilesz=dp->tilesz;
+   threaddata[nth].barr=dp->barr;
+   threaddata[nth].carr=dp->carr;
+   threaddata[nth].M=dp->M;
+   threaddata[nth].N=dp->N;
+   threaddata[nth].coh=dp->coh;
+   threaddata[nth].m=m;
+   threaddata[nth].n=n;
+   threaddata[nth].x=x;
+   threaddata[nth].p=p;
+   threaddata[nth].g=g;
+   threaddata[nth].g_start=ci;
+   threaddata[nth].g_end=ci+Nparm-1;
+   if (threaddata[nth].g_end>=m) {
+    threaddata[nth].g_end=m-1;
    }
+   ci=ci+Nparm;
+   pthread_create(&th_array[nth],&attr,cpu_calc_deriv,(void*)(&threaddata[nth]));
   }
 
+  /* now wait for threads to finish */
+  for(nth1=0; nth1<nth; nth1++) {
+   pthread_join(th_array[nth1],NULL);
+  }
 
- /* copy back solution to p */
- my_dcopy(m,xk,1,p,1);
+  pthread_attr_destroy(&attr);
 
- /* for (ci=0; ci<m; ci++) {
-   printf("grad %d=%lf\n",ci,gk[ci]);
-  } */
+  free(th_array);
+  free(threaddata);
 
-  free(gk);
-  free(xk1);
-  free(xk);
-  free(pk);
-  free(s);
-  free(y);
-  free(rho);
+  free(x);
+  return 0;
+}
+
+
+
+
+
+typedef struct wrapper_me_data_t_ {
+ me_data_t *adata;
+ void (*func)(double *p, double *hx, int m, int n, void *adata);
+ double *x; /* nx1 data vector */
+ int n;
+} wrapper_me_data_t;
+
+static double
+cost_func(double *p, int m, void *adata) {
+  wrapper_me_data_t *dp=(wrapper_me_data_t*)adata;
+  double *x=dp->x; /* input data */
+  int n=dp->n;
+  double *xmodel;
+
+  if ((xmodel=(double*)calloc((size_t)n,sizeof(double)))==0) {
+     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+     exit(1);
+  }
+
+  /* predict model */
+  minimize_viz_full_pth(p, xmodel, m, n, dp->adata);
+  /* find ||x-xmodel||^2 */ 
+  my_daxpy(n,x,-1.0,xmodel);
+  double f0=my_dnrm2(n,xmodel);
+  f0*=f0;
+
+  free(xmodel);
+  return f0;
+}
+
+static void 
+grad_func(double *p, double *g, int m, void *adata) {
+  wrapper_me_data_t *dp=(wrapper_me_data_t*)adata;
+  double *x=dp->x; /* input data */
+  int n=dp->n;
+
+  func_grad(&minimize_viz_full_pth,p,g,x, m, n, dp->adata);
+}
+
+
+static double
+robust_cost_func(double *p, int m, void *adata) {
+  wrapper_me_data_t *dp=(wrapper_me_data_t*)adata;
+  me_data_t *d1=(me_data_t *)dp->adata;
+  double *x=dp->x; /* input data */
+  int n=dp->n;
+  double *xmodel;
+
+  if ((xmodel=(double*)calloc((size_t)n,sizeof(double)))==0) {
+     fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+     exit(1);
+  }
+
+  /* predict model */
+  minimize_viz_full_pth(p, xmodel, m, n, dp->adata);
+
+  double f0=func_robust(x,xmodel,n, d1->robust_nu, d1->Nt);
+
+  free(xmodel);
+  return f0;
+}
+
+static void 
+robust_grad_func(double *p, double *g, int m, void *adata) {
+  wrapper_me_data_t *dp=(wrapper_me_data_t*)adata;
+  double *x=dp->x; /* input data */
+  int n=dp->n;
+
+  func_grad_robust(&minimize_viz_full_pth,p,g,x, m, n, dp->adata);
+}
+
+int
+lbfgs_fit_wrapper(
+   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads,
+ void *adata) {
+  wrapper_me_data_t data1;
+  data1.adata=(me_data_t *)adata;
+  data1.x=x; 
+  data1.n=n;
+
+   
+  lbfgs_fit(&cost_func,&grad_func,NULL,p,m,itmax,M,&data1);
+  return 0;
+}
+
+int
+lbfgs_fit_robust_wrapper(
+   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads,
+ void *adata) {
+  wrapper_me_data_t data1;
+  data1.adata=(me_data_t *)adata;
+  data1.x=x; 
+  data1.n=n;
+
+   
+  lbfgs_fit(&robust_cost_func,&robust_grad_func,NULL,p,m,itmax,M,&data1);
   return 0;
 }

From d11e1653fde199b8b831a397feaa1ef4aa2950a2 Mon Sep 17 00:00:00 2001
From: Sarod Yatawatta <sarod@dop373.nfra.nl>
Date: Mon, 30 Jul 2018 16:16:03 +0200
Subject: [PATCH 2/4] make sure rho text file exactly matches no. clusters

---
 src/MPI/main.cpp        | 4 ++--
 src/lib/Radio/readsky.c | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/MPI/main.cpp b/src/MPI/main.cpp
index 26e1d6b..655e4d2 100644
--- a/src/MPI/main.cpp
+++ b/src/MPI/main.cpp
@@ -34,7 +34,7 @@ using namespace Data;
 
 void
 print_copyright(void) {
-  cout<<"SAGECal-MPI 0.5.0 (C) 2011-2018 Sarod Yatawatta"<<endl;
+  cout<<"SAGECal-MPI 0.5.1 (C) 2011-2018 Sarod Yatawatta"<<endl;
 }
 
 
@@ -67,7 +67,7 @@ print_help(void) {
    cout << "-P consensus polynomial terms: default " <<Data::Npoly<< endl;
    cout << "-Q consensus polynomial type (0,1,2,3): default " <<Data::PolyType<< endl;
    cout << "-r regularization factor: default " <<Data::admm_rho<< endl;
-   cout << "-G regularization factor of each cluster (text file instead of -r, has to match exactly the cluster file first 2 columns): default : None" << endl;
+   cout << "-G regularization factor of each cluster (text file instead of -r, has to match _exactly_ the cluster file's first 2 columns): default : None" << endl;
    cout << "-C if >0, adaptive update of regularization factor: default "<<Data::aadmm<< endl;
    cout << "-x exclude baselines length (lambda) lower than this in calibration : default "<<Data::min_uvcut << endl;
    cout << "-y exclude baselines length (lambda) higher than this in calibration : default "<<Data::max_uvcut << endl;
diff --git a/src/lib/Radio/readsky.c b/src/lib/Radio/readsky.c
index 941973b..a2fccba 100644
--- a/src/lib/Radio/readsky.c
+++ b/src/lib/Radio/readsky.c
@@ -777,8 +777,10 @@ read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double
   c=skip_lines(cfp);
   ci=0; /* store it in reverse order */
   cj=0;
-  while(c>=0) {
+//printf("Mt=%d M=%d\n",Mt,M);
+  while(c>=0 && cj<M) {
     c=fscanf(cfp,"%d %d %lf",&cluster_id,&hybrid,&admm_rho);
+//printf("c=%d ci=%d cj=%d\n",c,ci,cj);
     /* add this value to arho array */
     if (c!=EOF && c>0) {
       /* found a valid line */
@@ -804,8 +806,9 @@ read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double
     c=skip_restof_line(cfp);
     c=skip_lines(cfp);
   }
+//printf("c=%d ci=%d cj=%d\n",c,ci,cj);
   /* report any errors */
-  if (!(c==EOF && ci==Mt-1)) {
+  if (!(c==EOF && ci==Mt-1 && cj=M)) {
     fprintf(stderr,"%s: %d: Error: cluster numbers in cluster file and regularization file do not match up.\n",__FILE__,__LINE__);
     exit(1);
   }

From 39d3ed8ad3bbb18080d3c40b894415f171c76e3f Mon Sep 17 00:00:00 2001
From: Sarod Yatawatta <sarod@dop373.nfra.nl>
Date: Mon, 30 Jul 2018 16:33:46 +0200
Subject: [PATCH 3/4] fix typo

---
 src/lib/Radio/readsky.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib/Radio/readsky.c b/src/lib/Radio/readsky.c
index a2fccba..1a9cde1 100644
--- a/src/lib/Radio/readsky.c
+++ b/src/lib/Radio/readsky.c
@@ -808,7 +808,7 @@ read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double
   }
 //printf("c=%d ci=%d cj=%d\n",c,ci,cj);
   /* report any errors */
-  if (!(c==EOF && ci==Mt-1 && cj=M)) {
+  if (!(c==EOF && ci==Mt-1 && cj==M)) {
     fprintf(stderr,"%s: %d: Error: cluster numbers in cluster file and regularization file do not match up.\n",__FILE__,__LINE__);
     exit(1);
   }

From 0b072da621f8217270c29e5a63048a3a95ac336c Mon Sep 17 00:00:00 2001
From: Sarod Yatawatta <sarod@dop373.nfra.nl>
Date: Tue, 31 Jul 2018 11:49:23 +0200
Subject: [PATCH 4/4] fixing issue #52

---
 src/lib/Dirac/Dirac.h                 |   7 ++
 src/lib/Dirac/rtr_solve.c             |  18 +++--
 src/lib/Dirac/rtr_solve_robust.c      | 110 --------------------------
 src/lib/Dirac/rtr_solve_robust_admm.c | 110 --------------------------
 4 files changed, 19 insertions(+), 226 deletions(-)

diff --git a/src/lib/Dirac/Dirac.h b/src/lib/Dirac/Dirac.h
index 5830600..5269206 100644
--- a/src/lib/Dirac/Dirac.h
+++ b/src/lib/Dirac/Dirac.h
@@ -967,6 +967,13 @@ typedef struct global_data_rtr_ {
   pthread_attr_t attr;
 } global_data_rtr_t;
 
+/* function to count how many baselines contribute to the calculation of
+   grad and hess, so normalization can be made */
+#ifdef USE_MIC
+__attribute__ ((target(MIC)))
+#endif
+extern void
+fns_fcount(global_data_rtr_t *gdata);
 
 /* RTR (ICASSP 2013) */
 #ifdef USE_MIC
diff --git a/src/lib/Dirac/rtr_solve.c b/src/lib/Dirac/rtr_solve.c
index 6218212..c1aefdf 100644
--- a/src/lib/Dirac/rtr_solve.c
+++ b/src/lib/Dirac/rtr_solve.c
@@ -81,13 +81,9 @@ threadfn_fns_fcount(void *data) {
    sta1=t->barr[ci+t->boff].sta1;
    sta2=t->barr[ci+t->boff].sta2;
 
-   pthread_mutex_lock(&t->mx_array[sta1]);
    t->bcount[sta1]+=1;
-   pthread_mutex_unlock(&t->mx_array[sta1]);
 
-   pthread_mutex_lock(&t->mx_array[sta2]);
    t->bcount[sta2]+=1;
-   pthread_mutex_unlock(&t->mx_array[sta2]);
    }
  }
 
@@ -100,7 +96,7 @@ threadfn_fns_fcount(void *data) {
 #ifdef USE_MIC
 __attribute__ ((target(MIC)))
 #endif
-static void
+void
 fns_fcount(global_data_rtr_t *gdata) {
 
   me_data_t *dp=(me_data_t*)gdata->medata;
@@ -146,7 +142,12 @@ fns_fcount(global_data_rtr_t *gdata) {
     threaddata[nth].boff=ci+boff;
     threaddata[nth].Nb=Nthb;
     threaddata[nth].barr=dp->barr;
-    threaddata[nth].bcount=bcount; /* note this should be 0 first */
+    /* note bcount[] should be 0 first */
+    if ((threaddata[nth].bcount=(int*)calloc((size_t)dp->N,sizeof(int)))==0) {
+     fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+     exit(1);
+    }
+
     threaddata[nth].mx_array=gdata->mx_array;
     
     pthread_create(&gdata->th_array[nth],&gdata->attr,threadfn_fns_fcount,(void*)(&threaddata[nth]));
@@ -157,6 +158,11 @@ fns_fcount(global_data_rtr_t *gdata) {
   /* now wait for threads to finish */
   for(nth1=0; nth1<nth; nth1++) {
    pthread_join(gdata->th_array[nth1],NULL);
+   /* add all thread bcount[] arrays into bcount[] */
+   for (ci=0; ci<dp->N; ci++) {
+    bcount[ci]+=threaddata[nth1].bcount[ci];
+   }
+   free(threaddata[nth1].bcount);
   }
  free(threaddata);
 
diff --git a/src/lib/Dirac/rtr_solve_robust.c b/src/lib/Dirac/rtr_solve_robust.c
index a19fced..6d7ba68 100644
--- a/src/lib/Dirac/rtr_solve_robust.c
+++ b/src/lib/Dirac/rtr_solve_robust.c
@@ -64,116 +64,6 @@ atmb(complex double * __restrict a, complex double * __restrict b, complex doubl
 }
 
 
-/* worker thread function for counting */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static void *
-threadfn_fns_fcount(void *data) {
- thread_data_rtr_t *t=(thread_data_rtr_t*)data;
- 
- int ci,sta1,sta2;
- for (ci=0; ci<t->Nb; ci++) {
-   /* if this baseline is flagged, we do not compute */
-   if (!t->barr[ci+t->boff].flag) {
-
-   /* stations for this baseline */
-   sta1=t->barr[ci+t->boff].sta1;
-   sta2=t->barr[ci+t->boff].sta2;
-
-   pthread_mutex_lock(&t->mx_array[sta1]);
-   t->bcount[sta1]+=1;
-   pthread_mutex_unlock(&t->mx_array[sta1]);
-
-   pthread_mutex_lock(&t->mx_array[sta2]);
-   t->bcount[sta2]+=1;
-   pthread_mutex_unlock(&t->mx_array[sta2]);
-   }
- }
-
- return NULL;
-}
-
-
-/* function to count how many baselines contribute to the calculation of
-   grad and hess, so normalization can be made */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static void
-fns_fcount(global_data_rtr_t *gdata) {
-
-  me_data_t *dp=(me_data_t*)gdata->medata;
-
-  int nth,nth1,ci;
-
-  /* no of threads */
-  int Nt=(dp->Nt);
-  int Nthb0,Nthb;
-  thread_data_rtr_t *threaddata;
-
-  int Nbase1=(dp->Nbase)*(dp->tilesz);
-  int boff=(dp->Nbase)*(dp->tileoff);
-
-  /* calculate min baselines a thread can handle */
-  Nthb0=(Nbase1+Nt-1)/Nt;
-
-  if ((threaddata=(thread_data_rtr_t*)malloc((size_t)Nt*sizeof(thread_data_rtr_t)))==0) {
-#ifndef USE_MIC
-    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-#endif
-    exit(1);
-  }
-  int *bcount;
-  if ((bcount=(int*)calloc((size_t)dp->N,sizeof(int)))==0) {
-#ifndef USE_MIC
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-#endif
-   exit(1);
-  }
-
-  /* iterate over threads, allocating baselines per thread */
-  ci=0;
-  for (nth=0;  nth<Nt && ci<Nbase1; nth++) {
-    /* this thread will handle baselines [ci:min(Nbase1-1,ci+Nthb0-1)] */
-    /* determine actual no. of baselines */
-    if (ci+Nthb0<Nbase1) {
-     Nthb=Nthb0;
-    } else {
-     Nthb=Nbase1-ci;
-    }
-
-    threaddata[nth].boff=ci+boff;
-    threaddata[nth].Nb=Nthb;
-    threaddata[nth].barr=dp->barr;
-    threaddata[nth].bcount=bcount; /* note this should be 0 first */
-    threaddata[nth].mx_array=gdata->mx_array;
-    
-    pthread_create(&gdata->th_array[nth],&gdata->attr,threadfn_fns_fcount,(void*)(&threaddata[nth]));
-    /* next baseline set */
-    ci=ci+Nthb;
-  }
-
-  /* now wait for threads to finish */
-  for(nth1=0; nth1<nth; nth1++) {
-   pthread_join(gdata->th_array[nth1],NULL);
-  }
- free(threaddata);
-
- /* calculate inverse count */
- for (nth1=0; nth1<dp->N; nth1++) {
-   gdata->iw[nth1]=(bcount[nth1]>0?1.0/(double)bcount[nth1]:0.0);
- }
- free(bcount);
- /* scale back weight such that max value is 1 */
- nth1=my_idamax(dp->N,gdata->iw,1);
- double maxw=gdata->iw[nth1-1]; /* 1 based index */
- if (maxw>0.0) { /* all baselines can be flagged */
-  my_dscal(dp->N,1.0/maxw,gdata->iw);
- }
-}
-
-
 
 /* worker thread function for cost function calculation */
 #ifdef USE_MIC
diff --git a/src/lib/Dirac/rtr_solve_robust_admm.c b/src/lib/Dirac/rtr_solve_robust_admm.c
index 18773d4..008430e 100644
--- a/src/lib/Dirac/rtr_solve_robust_admm.c
+++ b/src/lib/Dirac/rtr_solve_robust_admm.c
@@ -64,116 +64,6 @@ atmb(complex double * __restrict a, complex double * __restrict b, complex doubl
 }
 
 
-/* worker thread function for counting */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static void *
-threadfn_fns_fcount(void *data) {
- thread_data_rtr_t *t=(thread_data_rtr_t*)data;
- 
- int ci,sta1,sta2;
- for (ci=0; ci<t->Nb; ci++) {
-   /* if this baseline is flagged, we do not compute */
-   if (!t->barr[ci+t->boff].flag) {
-
-   /* stations for this baseline */
-   sta1=t->barr[ci+t->boff].sta1;
-   sta2=t->barr[ci+t->boff].sta2;
-
-   pthread_mutex_lock(&t->mx_array[sta1]);
-   t->bcount[sta1]+=1;
-   pthread_mutex_unlock(&t->mx_array[sta1]);
-
-   pthread_mutex_lock(&t->mx_array[sta2]);
-   t->bcount[sta2]+=1;
-   pthread_mutex_unlock(&t->mx_array[sta2]);
-   }
- }
-
- return NULL;
-}
-
-
-/* function to count how many baselines contribute to the calculation of
-   grad and hess, so normalization can be made */
-#ifdef USE_MIC
-__attribute__ ((target(MIC)))
-#endif
-static void
-fns_fcount(global_data_rtr_t *gdata) {
-
-  me_data_t *dp=(me_data_t*)gdata->medata;
-
-  int nth,nth1,ci;
-
-  /* no of threads */
-  int Nt=(dp->Nt);
-  int Nthb0,Nthb;
-  thread_data_rtr_t *threaddata;
-
-  int Nbase1=(dp->Nbase)*(dp->tilesz);
-  int boff=(dp->Nbase)*(dp->tileoff);
-
-  /* calculate min baselines a thread can handle */
-  Nthb0=(Nbase1+Nt-1)/Nt;
-
-  if ((threaddata=(thread_data_rtr_t*)malloc((size_t)Nt*sizeof(thread_data_rtr_t)))==0) {
-#ifndef USE_MIC
-    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-#endif
-    exit(1);
-  }
-  int *bcount;
-  if ((bcount=(int*)calloc((size_t)dp->N,sizeof(int)))==0) {
-#ifndef USE_MIC
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-#endif
-   exit(1);
-  }
-
-  /* iterate over threads, allocating baselines per thread */
-  ci=0;
-  for (nth=0;  nth<Nt && ci<Nbase1; nth++) {
-    /* this thread will handle baselines [ci:min(Nbase1-1,ci+Nthb0-1)] */
-    /* determine actual no. of baselines */
-    if (ci+Nthb0<Nbase1) {
-     Nthb=Nthb0;
-    } else {
-     Nthb=Nbase1-ci;
-    }
-
-    threaddata[nth].boff=ci+boff;
-    threaddata[nth].Nb=Nthb;
-    threaddata[nth].barr=dp->barr;
-    threaddata[nth].bcount=bcount; /* note this should be 0 first */
-    threaddata[nth].mx_array=gdata->mx_array;
-    
-    pthread_create(&gdata->th_array[nth],&gdata->attr,threadfn_fns_fcount,(void*)(&threaddata[nth]));
-    /* next baseline set */
-    ci=ci+Nthb;
-  }
-
-  /* now wait for threads to finish */
-  for(nth1=0; nth1<nth; nth1++) {
-   pthread_join(gdata->th_array[nth1],NULL);
-  }
- free(threaddata);
-
- /* calculate inverse count */
- for (nth1=0; nth1<dp->N; nth1++) {
-   gdata->iw[nth1]=(bcount[nth1]>0?1.0/(double)bcount[nth1]:0.0);
- }
- free(bcount);
- /* scale back weight such that max value is 1 */
- nth1=my_idamax(dp->N,gdata->iw,1);
- double maxw=gdata->iw[nth1-1]; /* 1 based index */
- if (maxw>0.0) { /* all baselines can be flagged */
-  my_dscal(dp->N,1.0/maxw,gdata->iw);
- }
-}
-
-
 
 /* worker thread function for cost function calculation */
 #ifdef USE_MIC