From fa1448be0197763c4f626f677f435c1e5c2cfd35 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Mon, 30 Jul 2018 15:25:08 +0200 Subject: [PATCH 1/4] cleaning up LBFGS related code, removed duplicates --- README.md | 2 +- src/lib/Dirac/CMakeLists.txt | 2 +- src/lib/Dirac/Dirac.h | 138 ++-- src/lib/Dirac/Makefile.gpu | 4 +- src/lib/Dirac/admm_solve.c | 168 ----- src/lib/Dirac/lbfgs.c | 119 +--- src/lib/Dirac/lbfgs_nocuda.c | 522 +++------------ src/lib/Dirac/lmfit.c | 22 +- src/lib/Dirac/lmfit_nocuda.c | 10 +- src/lib/Dirac/robust_lbfgs_nocuda.c | 974 ++++++++++------------------ 10 files changed, 503 insertions(+), 1458 deletions(-) diff --git a/README.md b/README.md index 4ab434b..fa3ca5f 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ which can be used to count the row number. It will keep repeating this, for each The rows 0 to 7 belong to the solutions for the 1st station. The rows 8 to 15 for the 2nd station and so on. Each 8 rows of any given column represent the 8 values of a 2x2 Jones matrix. Lets say these are ```S0,S1,S2,S3,S4,S5,S6``` and ```S7```. Then the Jones matrix is ```[S0+j*S1, S4+j*S5; S2+j*S3, S6+j*S7]``` (the ';' denotes the 1st row of the 2x2 matrix). -When a luster has a chunk size > 1, there will be more than 1 solution per given time interval. +When a cluster has a chunk size > 1, there will be more than 1 solution per given time interval. So for this cluster, there will be more than 1 column in the solution file, the exact number of columns being equal to the chunk size. diff --git a/src/lib/Dirac/CMakeLists.txt b/src/lib/Dirac/CMakeLists.txt index 814a333..ba648bd 100644 --- a/src/lib/Dirac/CMakeLists.txt +++ b/src/lib/Dirac/CMakeLists.txt @@ -8,7 +8,6 @@ set (objects manifold_average mdl myblas - robust_lbfgs_nocuda robustlm rtr_solve_robust_admm updatenu @@ -50,6 +49,7 @@ else() lmfit_nocuda consensus_poly lbfgs_nocuda + robust_lbfgs_nocuda rtr_solve rtr_solve_robust ) diff --git a/src/lib/Dirac/Dirac.h b/src/lib/Dirac/Dirac.h index 09504ad..5830600 100644 --- a/src/lib/Dirac/Dirac.h +++ b/src/lib/Dirac/Dirac.h @@ -139,8 +139,27 @@ find_sumproduct(int N, float *x, float *y, float *sum1, float *sum2, int Nt); /****************************** lbfgs.c ****************************/ /****************************** lbfgs_nocuda.c ****************************/ /* LBFGS routines */ -/* func: vector function to minimize, actual cost minimized is ||func-x||^2 - NOTE: gradient function given seperately + +#ifndef HAVE_CUDA +/* line search */ +/* func: scalar function + xk: parameter values size m x 1 (at which step is calculated) + pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) + alpha1: initial value for step + sigma,rho,t1,t2,t3: line search parameters (from Fletcher) + m: size or parameter vector + step: step size for differencing + adata: additional data passed to the function +*/ +extern double +linesearch( + double (*func)(double *p, int m, void *adata), + double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, int m, double step, void *adata); + +/* cost function : return a scalar cost, input : p (mx1) parameters, m: no. of params, adata: additional data + grad function: return gradient (mx1): input : p (mx1) parameters, g (mx1) gradient vector, m: no. of params, adata: additional data +*/ +/* p: parameters m x 1 (used as initial value, output final value) x: data n x 1 itmax: max iterations @@ -153,9 +172,21 @@ __attribute__ ((target(MIC))) #endif extern int lbfgs_fit( - void (*func)(double *p, double *hx, int m, int n, void *adata), + double (*cost_func)(double *p, int m, void *adata), + void (*grad_func)(double *p, double *g, int m, void *adata), + void (*change_batch)(int iter, void *adata), + double *p, int m, int itmax, int M, void *adata); +#endif /* !HAVE_CUDA */ + +#ifdef HAVE_CUDA +extern int +lbfgs_fit( + double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata); +extern int +lbfgs_fit_robust_cuda( double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, void *adata); - +#endif /* HAVE_CUDA */ + /****************************** robust_lbfgs_nocuda.c ****************************/ typedef struct thread_data_logf_t_ { double *f; @@ -164,22 +195,21 @@ typedef struct thread_data_logf_t_ { int start,end; double sum; } thread_data_logf_t; - /* robust_nu: nu in T distribution */ + #ifdef USE_MIC __attribute__ ((target(MIC))) #endif extern int -lbfgs_fit_robust( - void (*func)(double *p, double *hx, int m, int n, void *adata), +lbfgs_fit_robust_wrapper( double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, void *adata); -#ifdef HAVE_CUDA + extern int -lbfgs_fit_robust_cuda( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, void *adata); -#endif +lbfgs_fit_wrapper( + double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, + void *adata); + /****************************** mderiv.cu ****************************/ /* cuda driver for kernel */ @@ -1300,6 +1330,15 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x extern void openblas_set_num_threads(int num_threads); +/****************************** lmfit.c ****************************/ +/****************************** lmfit_nocuda.c ****************************/ +/* minimization (or vector cost) function (multithreaded) */ +/* p: size mx1 parameters + x: size nx1 model being calculated + data: extra info needed */ +extern void +minimize_viz_full_pth(double *p, double *x, int m, int n, void *data); + /********* solver modes *********/ #define SM_LM_LBFGS 1 #define SM_OSLM_LBFGS 0 @@ -1381,15 +1420,13 @@ bfgsfit_visibilities(double *u, double *v, double *w, double *x, int N, int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_lbfgs, int lbfgs_m, int gpu_threads, int solver_mode, double mean_nu, double *res_0, double *res_1); + +#ifdef HAVE_CUDA extern int bfgsfit_visibilities_gpu(double *u, double *v, double *w, double *x, int N, int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_lbfgs, int lbfgs_m, int gpu_threads, int solver_mode, double mean_nu, double *res_0, double *res_1); - - - -#ifdef HAVE_CUDA /* data struct shared by all threads */ typedef struct gb_data_ { int status[2]; /* 0: do nothing, @@ -1492,75 +1529,6 @@ extern int sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int N, int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize, double *mean_nu, double *res_0, double *res_1); -/****************************** stationbeam.c ****************************/ -/* - ra,dec: source direction (rad) - ra0,dec0: beam center (rad) - f: frequency (Hz) - f0: beam forming frequency (Hz) - - longitude,latitude : Nx1 array of station positions (rad,rad) - time_jd: JD (day) time - Nelem : Nx1 array, no. of elements used in each station - x,y,z: Nx1 pointer arrays to station positions, each station has Nelem[]x1 arrays - - beamgain: Nx1 array of station beam gain along the source direction -*/ -extern int -arraybeam(double ra, double dec, double ra0, double dec0, double f, double f0, int N, double *longitude, double *latitude, double time_jd, int *Nelem, double **x, double **y, double **z, double *beamgain); - - -/****************************** predict_withbeam.c ****************************/ -/* precalculate cluster coherencies - u,v,w: u,v,w coordinates (wavelengths) size Nbase*tilesz x 1 - u,v,w are ordered with baselines, timeslots - x: coherencies size Nbase*4*Mx 1 - ordered by XX(re,im),XY(re,im),YX(re,im), YY(re,im), baseline, timeslots - N: no of stations - Nbase: total no of baselines (including more than one tile or timeslot) - barr: baseline to station map, size Nbase*tilesz x 1 - carr: sky model/cluster info size Mx1 of clusters - M: no of clusters - freq0: frequency - fdelta: bandwidth for freq smearing - tdelta: integration time for time smearing - dec0: declination for time smearing - uvmin: baseline length sqrt(u^2+v^2) below which not to include in solution - uvmax: baseline length higher than this not included in solution - - Station beam specific parameters - ph_ra0,ph_dec0: beam pointing rad,rad - ph_freq0: beam reference freq - longitude,latitude: Nx1 arrays (rad,rad) station locations - time_utc: JD (day) : tilesz x 1 - tilesz: how many tiles: == unique time_utc - Nelem: Nx1 array, size of stations (elements) - xx,yy,zz: Nx1 arrays of station element locations arrays xx[],yy[],zz[] - Nt: no of threads - - NOTE: prediction is done for all baselines, even flagged ones - and flags are set to 2 for baselines lower than uvcut -*/ - -extern int -precalculate_coherencies_withbeam(double *u, double *v, double *w, complex double *x, int N, - int Nbase, baseline_t *barr, clus_source_t *carr, int M, double freq0, double fdelta, double tdelta, double dec0, double uvmin, double uvmax, - double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int tileze, int *Nelem, double **xx, double **yy, double **zz, int Nt); - - -extern int -predict_visibilities_multifreq_withbeam(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0, - double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int Nt, int add_to_data); - -extern int -calculate_residuals_multifreq_withbeam(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0, -double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int Nt, int ccid, double rho, int phase_only); - - -/* change epoch of soure ra,dec from J2000 to JAPP */ -/* also the beam pointing ra_beam,dec_beam */ -extern int -precess_source_locations(double jd_tdb, clus_source_t *carr, int M, double *ra_beam, double *dec_beam, int Nt); #ifdef __cplusplus } /* extern "C" */ diff --git a/src/lib/Dirac/Makefile.gpu b/src/lib/Dirac/Makefile.gpu index 564defd..e01d9e0 100644 --- a/src/lib/Dirac/Makefile.gpu +++ b/src/lib/Dirac/Makefile.gpu @@ -21,7 +21,7 @@ INCLUDES=-I. LIBPATH= $(CUDALIB) -OBJECTS=lmfit.o lbfgs.o myblas.o mderiv.o clmfit.o clmfit_nocuda.o barrier.o robust.o robustlm.o oslmfit.o mderiv_fl.o clmfit_fl.o updatenu.o robust_lbfgs_nocuda.o robust_fl.o manifold_fl.o rtr_solve_cuda.o rtr_solve_robust_cuda.o manifold_average.o consensus_poly.o rtr_solve_robust_cuda_admm.o rtr_solve_robust_admm.o admm_solve.o load_balance.o mdl.o +OBJECTS=lmfit.o lbfgs.o myblas.o mderiv.o clmfit.o clmfit_nocuda.o barrier.o robust.o robustlm.o oslmfit.o mderiv_fl.o clmfit_fl.o updatenu.o robust_fl.o manifold_fl.o rtr_solve_cuda.o rtr_solve_robust_cuda.o manifold_average.o consensus_poly.o rtr_solve_robust_cuda_admm.o rtr_solve_robust_admm.o admm_solve.o load_balance.o mdl.o default:libdirac.a @@ -47,8 +47,6 @@ robust_fl.o:robust_fl.cu $(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $< oslmfit.o:oslmfit.c $(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $< -robust_lbfgs_nocuda.o:robust_lbfgs_nocuda.c - $(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $< clmfit_fl.o:clmfit_fl.c $(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $< updatenu.o:updatenu.c diff --git a/src/lib/Dirac/admm_solve.c b/src/lib/Dirac/admm_solve.c index 9628069..a8edb23 100644 --- a/src/lib/Dirac/admm_solve.c +++ b/src/lib/Dirac/admm_solve.c @@ -213,174 +213,6 @@ mylm_fit_single_pth(double *p, double *x, int m, int n, void *data) { -/* worker thread function for prediction */ -static void * -predict_threadfn_withgain_full(void *data) { - thread_data_base_t *t=(thread_data_base_t*)data; - - int ci,cm,sta1,sta2; - double *pm; - complex double C[4],G1[4],G2[4],T1[4],T2[4]; - int M=(t->M); - int Ntilebase=(t->Nbase)*(t->tilesz); - int px; - - for (ci=0; ciNb; ci++) { - /* iterate over the sky model and calculate contribution */ - /* for this x[8*ci:8*(ci+1)-1] */ - memset(&(t->x[8*ci]),0,sizeof(double)*8); - - /* if this baseline is flagged, we do not compute */ - if (!t->barr[ci+t->boff].flag) { - - /* stations for this baseline */ - sta1=t->barr[ci+t->boff].sta1; - sta2=t->barr[ci+t->boff].sta2; - for (cm=0; cm x[0.....Nbase*tilesz/nchunk-1] - p[1] -> x[Nbase*tilesz/nchunk......2*Nbase*tilesz-1] - .... - p[last] -> x[(nchunk-1)*Nbase*tilesz/nchunk......Nbase*tilesz] - - so given bindex, right p[] is bindex/((Nbase*tilesz+nchunk-1)/nchunk) - */ - - px=(ci+t->boff)/((Ntilebase+t->carr[cm].nchunk-1)/t->carr[cm].nchunk); - //pm=&(t->p[cm*8*N]); - pm=&(t->p[t->carr[cm].p[px]]); - G1[0]=(pm[sta1*8])+_Complex_I*(pm[sta1*8+1]); - G1[1]=(pm[sta1*8+2])+_Complex_I*(pm[sta1*8+3]); - G1[2]=(pm[sta1*8+4])+_Complex_I*(pm[sta1*8+5]); - G1[3]=(pm[sta1*8+6])+_Complex_I*(pm[sta1*8+7]); - G2[0]=(pm[sta2*8])+_Complex_I*(pm[sta2*8+1]); - G2[1]=(pm[sta2*8+2])+_Complex_I*(pm[sta2*8+3]); - G2[2]=(pm[sta2*8+4])+_Complex_I*(pm[sta2*8+5]); - G2[3]=(pm[sta2*8+6])+_Complex_I*(pm[sta2*8+7]); - - - /* use pre calculated values */ - C[0]=t->coh[4*M*ci+4*cm]; - C[1]=t->coh[4*M*ci+4*cm+1]; - C[2]=t->coh[4*M*ci+4*cm+2]; - C[3]=t->coh[4*M*ci+4*cm+3]; - - /* form G1*C*G2' */ - /* T1=G1*C */ - amb(G1,C,T1); - /* T2=T1*G2' */ - ambt(T1,G2,T2); - - /* add to baseline visibilities */ - t->x[8*ci]+=creal(T2[0]); - t->x[8*ci+1]+=cimag(T2[0]); - t->x[8*ci+2]+=creal(T2[1]); - t->x[8*ci+3]+=cimag(T2[1]); - t->x[8*ci+4]+=creal(T2[2]); - t->x[8*ci+5]+=cimag(T2[2]); - t->x[8*ci+6]+=creal(T2[3]); - t->x[8*ci+7]+=cimag(T2[3]); - } - } - } - - return NULL; -} - - -/* minimization function (multithreaded) */ -/* p: size mx1 parameters - x: size nx1 data calculated - data: extra info needed */ -static void -minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) { - - me_data_t *dp=(me_data_t*)data; - /* u,v,w : size Nbase*tilesz x 1 x: size Nbase*8*tilesz x 1 */ - /* barr: size Nbase*tilesz x 1 carr: size Mx1 */ - /* pp: size 8*N*M x 1 */ - /* pm: size Mx1 of double */ - - int nth,nth1,ci; - - /* no of threads */ - int Nt=(dp->Nt); - int Nthb0,Nthb; - pthread_attr_t attr; - pthread_t *th_array; - thread_data_base_t *threaddata; - - int Nbase1=(dp->Nbase)*(dp->tilesz); - - /* calculate min baselines a thread can handle */ - //Nthb0=ceil((double)Nbase1/(double)Nt); - Nthb0=(Nbase1+Nt-1)/Nt; - - /* setup threads */ - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE); - - if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((threaddata=(thread_data_base_t*)malloc((size_t)Nt*sizeof(thread_data_base_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - - - /* iterate over threads, allocating baselines per thread */ - ci=0; - for (nth=0; nthbarr; - threaddata[nth].u=&(dp->u[ci]); - threaddata[nth].v=&(dp->v[ci]); - threaddata[nth].w=&(dp->w[ci]); - threaddata[nth].carr=dp->carr; - threaddata[nth].M=dp->M; - threaddata[nth].x=&(x[8*ci]); - threaddata[nth].p=p; - threaddata[nth].N=dp->N; - threaddata[nth].Nbase=dp->Nbase; - threaddata[nth].tilesz=dp->tilesz; - threaddata[nth].coh=&(dp->coh[4*(dp->M)*ci]); - - //printf("thread %d predict data from %d baselines %d\n",nth,8*ci,Nthb); - pthread_create(&th_array[nth],&attr,predict_threadfn_withgain_full,(void*)(&threaddata[nth])); - /* next baseline set */ - ci=ci+Nthb; - } - - /* now wait for threads to finish */ - for(nth1=0; nth1b is possible) to find step that minimizes cost function */ -/* func: vector function +/* xk: parameter values size m x 1 (at which step is calculated) pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) a/b: interval for interpolation @@ -299,7 +299,6 @@ mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, i */ static double cubic_interp( - void (*func)(double *p, double *hx, int m, int n, void *adata), double *xk, double *pk, double a, double b, double *x, double *xp, double *xo, int m, int n, double step, void *adata, th_pipeline *tp, gbdata_b *tpg) { double f0,f1,f0d,f1d; /* function values and derivatives at a,b */ @@ -308,10 +307,6 @@ cubic_interp( my_dcopy(m,xk,1,xp,1); /* xp<=xk */ my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */ -// func(xp,x,m,n,adata); -// my_daxpy(n,xo,-1.0,x); -// f0=my_dnrm2(n,x); -// f0*=f0; sync_barrier(&(tp->gate1)); tpg->lmdata[0]->p=tpg->lmdata[1]->p=xp; tpg->status[0]=tpg->status[1]=PT_DO_CCOST; @@ -323,9 +318,6 @@ cubic_interp( /* grad(phi_0): evaluate at -step and +step */ my_daxpy(m,pk,step,xp); /* xp<=xp+(a+step)*pk */ -// func(xp,x,m,n,adata); -// my_daxpy(n,xo,-1.0,x); -// p01=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -336,9 +328,6 @@ cubic_interp( my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(a-step)*pk */ -// func(xp,x,m,n,adata); -// my_daxpy(n,xo,-1.0,x); -// p02=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -348,16 +337,9 @@ cubic_interp( sync_barrier(&(tp->gate2)); -// f0d=(p01*p01-p02*p02)/(2.0*step); f0d=(p01-p02)/(2.0*step); - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(a-step)*pk */ - //my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */ my_daxpy(m,pk,-a+step+b,xp); /* xp<=xp+(b)*pk */ -// func(xp,x,m,n,adata); -// my_daxpy(n,xo,-1.0,x); -// f1=my_dnrm2(n,x); -// f1*=f1; sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -369,9 +351,6 @@ cubic_interp( /* grad(phi_1): evaluate at -step and +step */ my_daxpy(m,pk,step,xp); /* xp<=xp+(b+step)*pk */ -// func(xp,x,m,n,adata); -// my_daxpy(n,xo,-1.0,x); -// p01=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -382,9 +361,6 @@ cubic_interp( my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(b-step)*pk */ -// func(xp,x,m,n,adata); -// my_daxpy(n,xo,-1.0,x); -// p02=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -394,7 +370,6 @@ cubic_interp( sync_barrier(&(tp->gate2)); -// f1d=(p01*p01-p02*p02)/(2.0*step); f1d=(p01-p02)/(2.0*step); @@ -417,13 +392,7 @@ cubic_interp( fz0=f0+f1; } else { /* evaluate function for this root */ - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(b-step)*pk */ - //my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */ my_daxpy(m,pk,-b+step+a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */ -// func(xp,x,m,n,adata); -// my_daxpy(n,xo,-1.0,x); -// fz0=my_dnrm2(n,x); -// fz0*=fz0; sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -460,7 +429,7 @@ cubic_interp( /*************** Fletcher line search **********************************/ /* zoom function for line search */ -/* func: vector function +/* xk: parameter values size m x 1 (at which step is calculated) pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) a/b: bracket interval [a,b] (a>b) is possible @@ -475,7 +444,6 @@ cubic_interp( */ static double linesearch_zoom( - void (*func)(double *p, double *hx, int m, int n, void *adata), double *xk, double *pk, double a, double b, double *x, double *xp, double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata, th_pipeline *tp, gbdata_b *tpg) { double alphaj,phi_j,phi_aj; @@ -490,17 +458,12 @@ linesearch_zoom( /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */ p01=aj+t2*(bj-aj); p02=bj-t3*(bj-aj); - alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata,tp,tpg); + alphaj=cubic_interp(xk,pk,p01,p02,x,xp,xo,m,n,step,adata,tp,tpg); //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj); /* evaluate phi(alphaj) */ my_dcopy(m,xk,1,xp,1); /* xp<=xk */ my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */ -// func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -// my_daxpy(n,xo,-1.0,x); -// phi_j=my_dnrm2(n,x); -// phi_j*=phi_j; sync_barrier(&(tp->gate1)); tpg->lmdata[0]->p=tpg->lmdata[1]->p=xp; tpg->status[0]=tpg->status[1]=PT_DO_CCOST; @@ -512,14 +475,7 @@ linesearch_zoom( /* evaluate phi(aj) */ - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+alphaj*pk */ - //my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */ my_daxpy(m,pk,-alphaj+aj,xp); /* xp<=xp+(aj)*pk */ -// func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -// my_daxpy(n,xo,-1.0,x); -// phi_aj=my_dnrm2(n,x); -// phi_aj*=phi_aj; sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -533,13 +489,7 @@ linesearch_zoom( bj=alphaj; /* aj unchanged */ } else { /* evaluate grad(alphaj) */ - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+aj*pk */ - //my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */ my_daxpy(m,pk,-aj+alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */ -// func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -// my_daxpy(n,xo,-1.0,x); -// p01=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -550,10 +500,6 @@ linesearch_zoom( my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */ -// func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -// my_daxpy(n,xo,-1.0,x); -// p02=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -563,7 +509,6 @@ linesearch_zoom( sync_barrier(&(tp->gate2)); -// gphi_j=(p01*p01-p02*p02)/(2.0*step); gphi_j=(p01-p02)/(2.0*step); /* termination due to roundoff/other errors pp. 38, Fletcher */ @@ -601,7 +546,7 @@ linesearch_zoom( /* line search */ -/* func: vector function +/* xk: parameter values size m x 1 (at which step is calculated) pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) alpha1: initial value for step @@ -613,7 +558,6 @@ linesearch_zoom( */ static double linesearch( - void (*func)(double *p, double *hx, int m, int n, void *adata), double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata, th_pipeline *tp, gbdata_b *tpg) { /* phi(alpha)=f(xk+alpha pk) for vector function func @@ -642,11 +586,6 @@ linesearch( alphak=1.0; /* evaluate phi_0 and grad(phi_0) */ -//func(xk,x,m,n,adata); -//my_daxpy(n,xo,-1.0,x); -//phi_0=my_dnrm2(n,x); -//phi_0*=phi_0; -//printf("CPU cost=%lf\n",phi_0); sync_barrier(&(tp->gate1)); tpg->lmdata[0]->p=tpg->lmdata[1]->p=xk; tpg->status[0]=tpg->status[1]=PT_DO_CCOST; @@ -663,10 +602,6 @@ linesearch( my_dcopy(m,xk,1,xp,1); /* xp<=xk */ my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */ -//func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -//my_daxpy(n,xo,-1.0,x); -//p01=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->lmdata[0]->p=tpg->lmdata[1]->p=xp; tpg->status[0]=tpg->status[1]=PT_DO_CCOST; @@ -677,10 +612,6 @@ linesearch( sync_barrier(&(tp->gate2)); my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */ -//func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -//my_daxpy(n,xo,-1.0,x); -//p02=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -690,7 +621,6 @@ linesearch( sync_barrier(&(tp->gate2)); -// gphi_0=(p01*p01-p02*p02)/(2.0*step); gphi_0=(p01-p02)/(2.0*step); @@ -719,11 +649,6 @@ linesearch( /* evalualte phi(alpha(i))=f(xk+alphai pk) */ my_dcopy(m,xk,1,xp,1); /* xp<=xk */ my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */ -// func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -// my_daxpy(n,xo,-1.0,x); -// phi_alphai=my_dnrm2(n,x); -// phi_alphai*=phi_alphai; sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -743,7 +668,7 @@ linesearch( if ((phi_alphai>phi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) { /* ai=alphai1, bi=alphai bracket */ - alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata,tp,tpg); + alphak=linesearch_zoom(xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata,tp,tpg); #ifdef DEBUG printf("Linesearch : Condition 1 met\n"); #endif @@ -751,13 +676,7 @@ linesearch( } /* evaluate grad(phi(alpha(i))) */ - //my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here because already xp=xk+alphai*pk */ - //my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */ my_daxpy(m,pk,step,xp); /* xp<=xp+(alphai+step)*pk */ -// func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -// my_daxpy(n,xo,-1.0,x); -// p01=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -767,10 +686,6 @@ linesearch( sync_barrier(&(tp->gate2)); my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */ -// func(xp,x,m,n,adata); - /* calculate x<=x-xo */ -// my_daxpy(n,xo,-1.0,x); -// p02=my_dnrm2(n,x); sync_barrier(&(tp->gate1)); tpg->status[0]=tpg->status[1]=PT_DO_CCOST; sync_barrier(&(tp->gate2)); @@ -780,7 +695,6 @@ linesearch( sync_barrier(&(tp->gate2)); -// gphi_i=(p01*p01-p02*p02)/(2.0*step); gphi_i=(p01-p02)/(2.0*step); if (fabs(gphi_i)<=-sigma*gphi_0) { @@ -793,7 +707,7 @@ linesearch( if (gphi_i>=0) { /* ai=alphai, bi=alphai1 bracket */ - alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata,tp,tpg); + alphak=linesearch_zoom(xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata,tp,tpg); #ifdef DEBUG printf("Linesearch : Condition 3 met\n"); #endif @@ -809,7 +723,7 @@ linesearch( /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */ p01=2.0*alphai-alphai1; p02=MIN(mu,alphai+t1*(alphai-alphai1)); - alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata,tp,tpg); + alphai=cubic_interp(xk,pk,p01,p02,x,xp,xo,m,n,step,adata,tp,tpg); //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai); } phi_alphai1=phi_alphai; @@ -832,7 +746,6 @@ linesearch( /* note M here is LBFGS memory size */ static int lbfgs_fit_common( - void (*func)(double *p, double *hx, int m, int n, void *adata), double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, int do_robust, void *adata) { double *gk; /* gradients at both k+1 and k iter */ @@ -987,14 +900,6 @@ lbfgs_fit_common( sync_barrier(&(tp.gate1)); tpg.status[0]=tpg.status[1]=PT_DO_NOTHING; sync_barrier(&(tp.gate2)); -// for (ci=0; ci<20; ci++) { -// printf("GPU %d %lf\n",ci,gk[ci]); -// } - /* gradient gk=grad(f)_k */ -// func_grad(func,xk,gk,x,m,n,step,gpu_threads,adata); -// for (ci=0; ci<20; ci++) { -// printf("CPU %d %lf\n",ci,gk[ci]); -// } double gradnrm=my_dnrm2(m,gk); /* if gradient is too small, no need to solve, so stop */ @@ -1025,7 +930,7 @@ lbfgs_fit_common( /* linesearch to find step length */ /* parameters alpha1=10.0,sigma=0.1, rho=0.01, t1=9, t2=0.1, t3=0.5 */ /* FIXME: update paramters for GPU gradient */ - alphak=linesearch(func,xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata,&tp, &tpg); + alphak=linesearch(xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata,&tp, &tpg); /* check if step size is too small, or nan, then stop */ if (!isnormal(alphak) || fabs(alphak) - -/**** repeated code here ********************/ -/* Jones matrix multiplication - C=A*B -*/ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static void -amb(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) { - c[0]=a[0]*b[0]+a[1]*b[2]; - c[1]=a[0]*b[1]+a[1]*b[3]; - c[2]=a[2]*b[0]+a[3]*b[2]; - c[3]=a[2]*b[1]+a[3]*b[3]; -} - -/* Jones matrix multiplication - C=A*B^H -*/ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static void -ambt(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) { - c[0]=a[0]*conj(b[0])+a[1]*conj(b[1]); - c[1]=a[0]*conj(b[2])+a[1]*conj(b[3]); - c[2]=a[2]*conj(b[0])+a[3]*conj(b[1]); - c[3]=a[2]*conj(b[2])+a[3]*conj(b[3]); -} - -/**** end repeated code ********************/ - -/* worker thread for a cpu */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static void * -cpu_calc_deriv(void *adata) { - thread_data_grad_t *t=(thread_data_grad_t*)adata; - - int ci,nb; - int stc,stoff,stm,sta1,sta2; - int N=t->N; /* stations */ - int M=t->M; /* clusters */ - int Nbase=(t->Nbase)*(t->tilesz); - - - complex double xr[4]; /* residuals */ - complex double G1[4],G2[4],C[4],T1[4],T2[4]; - double pp[8]; - complex double csum; - int cli,tpchunk,pstart,nchunk,tilesperchunk,stci,ttile,tptile,poff; - - /* iterate over each paramter */ - for (ci=t->g_start; ci<=t->g_end; ++ci) { - t->g[ci]=0.0; - /* find station and parameter corresponding to this value of ci */ - /* this parameter should correspond to the right baseline (x tilesz) - to contribute to the derivative */ - cli=0; - while((clicarr[cli].p[0] || ci>t->carr[cli].p[0]+8*N*t->carr[cli].nchunk-1)) { - cli++; - } - /* now either cli>=M: cluster not found - or cli=t->carr[cli-1].p[0] && ci<=t->carr[cli-1].p[0]+8*N*t->carr[cli-1].nchunk-1) { - cli--; - } - - if (clicarr[cli].p[0]; - - stc=(stci%(8*N))/8; /* 0..N-1 */ - /* make sure this baseline contribute to this parameter */ - tpchunk=stci/(8*N); - nchunk=t->carr[cli].nchunk; - pstart=t->carr[cli].p[0]; - tilesperchunk=(t->tilesz+nchunk-1)/nchunk; - - - /* iterate over all baselines and accumulate sum */ - for (nb=0; nbNbase; - /* which chunk this tile belongs to */ - tptile=ttile/tilesperchunk; - /* now tptile has to match tpchunk, otherwise ignore calculation */ - if (tptile==tpchunk) { - - sta1=t->barr[nb].sta1; - sta2=t->barr[nb].sta2; - - if (((stc==sta1)||(stc==sta2))&& !t->barr[nb].flag) { - /* this baseline has a contribution */ - /* which paramter of this station */ - stoff=(stci%(8*N))%8; /* 0..7 */ - /* which cluster */ - stm=cli; /* 0..M-1 */ - - /* exact expression for derivative - 2 real( vec^H(residual_this_baseline) - * vec(-J_{pm}C_{pqm} J_{qm}^H) - where m: chosen cluster - J_{pm},J_{qm} Jones matrices for baseline p-q - depending on the parameter, J ==> E - E: zero matrix, except 1 at location of m - - residual : in x[8*nb:8*nb+7] - C coh: in coh[8*M*nb+m*8:8*M*nb+m*8+7] (double storage) - coh[4*M*nb+4*m:4*M*nb+4*m+3] (complex storage) - J_p,J_q: in p[sta1*8+m*8*N: sta1*8+m*8*N+7] - and p[sta2*8+m*8*N: sta2*8+m*8*N+ 7] - */ - /* read in residual vector, conjugated */ - xr[0]=(t->x[nb*8])-_Complex_I*(t->x[nb*8+1]); - xr[1]=(t->x[nb*8+2])-_Complex_I*(t->x[nb*8+3]); - xr[2]=(t->x[nb*8+4])-_Complex_I*(t->x[nb*8+5]); - xr[3]=(t->x[nb*8+6])-_Complex_I*(t->x[nb*8+7]); - - /* read in coherency */ - C[0]=t->coh[4*M*nb+4*stm]; - C[1]=t->coh[4*M*nb+4*stm+1]; - C[2]=t->coh[4*M*nb+4*stm+2]; - C[3]=t->coh[4*M*nb+4*stm+3]; - - memset(pp,0,sizeof(double)*8); - if (stc==sta1) { - /* this station parameter gradient */ - pp[stoff]=1.0; - memset(G1,0,sizeof(complex double)*4); - G1[0]=pp[0]+_Complex_I*pp[1]; - G1[1]=pp[2]+_Complex_I*pp[3]; - G1[2]=pp[4]+_Complex_I*pp[5]; - G1[3]=pp[6]+_Complex_I*pp[7]; - poff=pstart+tpchunk*8*N+sta2*8; - G2[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]); - G2[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]); - G2[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+4]); - G2[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]); - - } else if (stc==sta2) { - memset(G2,0,sizeof(complex double)*4); - pp[stoff]=1.0; - G2[0]=pp[0]+_Complex_I*pp[1]; - G2[1]=pp[2]+_Complex_I*pp[3]; - G2[2]=pp[4]+_Complex_I*pp[5]; - G2[3]=pp[6]+_Complex_I*pp[7]; - poff=pstart+tpchunk*8*N+sta1*8; - G1[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]); - G1[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]); - G1[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+5]); - G1[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]); - } - - /* T1=G1*C */ - amb(G1,C,T1); - /* T2=T1*G2' */ - ambt(T1,G2,T2); - - /* calculate product xr*vec(J_p C J_q^H ) */ - csum=xr[0]*T2[0]; - csum+=xr[1]*T2[1]; - csum+=xr[2]*T2[2]; - csum+=xr[3]*T2[3]; - - /* accumulate sum */ - t->g[ci]+=-2.0*creal(csum); - } - } - } - } - } - - - return NULL; -} - -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static int -func_grad( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *p, double *g, double *xo, int m, int n, double step, void *adata) { - /* gradient for each parameter is - (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step - i=0,...,m-1 for all parameters - e_i: unit vector, 1 only at i-th location - */ - - double *x; /* array to store residual */ - int ci; - me_data_t *dp=(me_data_t*)adata; - - int Nt=dp->Nt; - - pthread_attr_t attr; - pthread_t *th_array; - thread_data_grad_t *threaddata; - - - if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - /* evaluate func once, store in x, and create threads */ - /* and calculate the residual x=xo-func */ - func(p,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - - /* setup threads */ - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE); - - if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - if ((threaddata=(thread_data_grad_t*)malloc((size_t)Nt*sizeof(thread_data_grad_t)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - - int nth,nth1,Nparm; - - /* parameters per thread */ - Nparm=(m+Nt-1)/Nt; - - /* each thread will calculate derivative of part of - parameters */ - ci=0; - for (nth=0; nthNbase; - threaddata[nth].tilesz=dp->tilesz; - threaddata[nth].barr=dp->barr; - threaddata[nth].carr=dp->carr; - threaddata[nth].M=dp->M; - threaddata[nth].N=dp->N; - threaddata[nth].coh=dp->coh; - threaddata[nth].m=m; - threaddata[nth].n=n; - threaddata[nth].x=x; - threaddata[nth].p=p; - threaddata[nth].g=g; - threaddata[nth].g_start=ci; - threaddata[nth].g_end=ci+Nparm-1; - if (threaddata[nth].g_end>=m) { - threaddata[nth].g_end=m-1; - } - ci=ci+Nparm; - pthread_create(&th_array[nth],&attr,cpu_calc_deriv,(void*)(&threaddata[nth])); - } - - /* now wait for threads to finish */ - for(nth1=0; nth1b is possible) to find step that minimizes cost function */ -/* func: vector function +/* func: scalar function xk: parameter values size m x 1 (at which step is calculated) pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) a/b: interval for interpolation - x: size n x 1 (storage) xp: size m x 1 (storage) - xo: observed data size n x 1 - n: size of vector function step: step size for differencing adata: additional data passed to the function */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif static double cubic_interp( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *xk, double *pk, double a, double b, double *x, double *xp, double *xo, int m, int n, double step, void *adata) { + double (*func)(double *p, int m, void *adata), + double *xk, double *pk, double a, double b, double *xp, int m, double step, void *adata) { double f0,f1,f0d,f1d; /* function values and derivatives at a,b */ double p01,p02,z0,fz0; @@ -406,38 +124,24 @@ cubic_interp( my_dcopy(m,xk,1,xp,1); /* xp<=xk */ my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */ - func(xp,x,m,n,adata); - my_daxpy(n,xo,-1.0,x); - f0=my_dnrm2(n,x); - f0*=f0; + f0=func(xp,m,adata); /* grad(phi_0): evaluate at -step and +step */ my_daxpy(m,pk,step,xp); /* xp<=xp+(a+step)*pk */ - func(xp,x,m,n,adata); - my_daxpy(n,xo,-1.0,x); - p01=my_dnrm2(n,x); + p01=func(xp,m,adata); my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(a-step)*pk */ - func(xp,x,m,n,adata); - my_daxpy(n,xo,-1.0,x); - p02=my_dnrm2(n,x); - f0d=(p01*p01-p02*p02)/(2.0*step); + p02=func(xp,m,adata); + f0d=(p01-p02)/(2.0*step); - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(a-step)*pk */ - //my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */ + //FIXME my_dcopy(m,xk,1,xp,1); /* not necessary because xp=xk+(a-step)*pk */ + //FIXME my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */ my_daxpy(m,pk,-a+step+b,xp); /* xp<=xp+(b)*pk */ - func(xp,x,m,n,adata); - my_daxpy(n,xo,-1.0,x); - f1=my_dnrm2(n,x); - f1*=f1; + f1=func(xp,m,adata); /* grad(phi_1): evaluate at -step and +step */ my_daxpy(m,pk,step,xp); /* xp<=xp+(b+step)*pk */ - func(xp,x,m,n,adata); - my_daxpy(n,xo,-1.0,x); - p01=my_dnrm2(n,x); + p01=func(xp,m,adata); my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(b-step)*pk */ - func(xp,x,m,n,adata); - my_daxpy(n,xo,-1.0,x); - p02=my_dnrm2(n,x); - f1d=(p01*p01-p02*p02)/(2.0*step); + p02=func(xp,m,adata); + f1d=(p01-p02)/(2.0*step); //printf("Interp a,f(a),f'(a): (%lf,%lf,%lf) (%lf,%lf,%lf)\n",a,f0,f0d,b,f1,f1d); @@ -459,13 +163,10 @@ cubic_interp( fz0=f0+f1; } else { /* evaluate function for this root */ - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(b-step)*pk */ - //my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */ - my_daxpy(m,pk,-b+step+a+z0*(b-a),xp); /* xp<=xp+(a+z0(b-a))*pk */ - func(xp,x,m,n,adata); - my_daxpy(n,xo,-1.0,x); - fz0=my_dnrm2(n,x); - fz0*=fz0; + //FIXME my_dcopy(m,xk,1,xp,1); /* not necessary because xp=xk+(b-step)*pk */ + //my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(z0)*pk */ + my_daxpy(m,pk,-b+step+a+z0*(b-a),xp); /* xp<=xp+(a+z0*(b-a))*pk */ + fz0=func(xp,m,adata); } /* now choose between f0,f1,fz0,fz1 */ @@ -496,26 +197,21 @@ cubic_interp( /*************** Fletcher line search **********************************/ /* zoom function for line search */ -/* func: vector function +/* func: scalar function xk: parameter values size m x 1 (at which step is calculated) pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) a/b: bracket interval [a,b] (a>b) is possible - x: size n x 1 (storage) xp: size m x 1 (storage) phi_0: phi(0) gphi_0: grad(phi(0)) - xo: observed data size n x 1 - n: size of vector function + sigma,rho,t1,t2,t3: line search parameters (from Fletcher) step: step size for differencing adata: additional data passed to the function */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif static double linesearch_zoom( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *xk, double *pk, double a, double b, double *x, double *xp, double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) { + double (*func)(double *p, int m, void *adata), + double *xk, double *pk, double a, double b, double *xp, double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, int m, double step, void *adata) { double alphaj,phi_j,phi_aj; double gphi_j,p01,p02,aj,bj; @@ -529,46 +225,31 @@ linesearch_zoom( /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */ p01=aj+t2*(bj-aj); p02=bj-t3*(bj-aj); - alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata); + alphaj=cubic_interp(func,xk,pk,p01,p02,xp,m,step,adata); //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj); /* evaluate phi(alphaj) */ my_dcopy(m,xk,1,xp,1); /* xp<=xk */ my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - phi_j=my_dnrm2(n,x); - phi_j*=phi_j; + phi_j=func(xp,m,adata); /* evaluate phi(aj) */ - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+alphaj*pk */ - //my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */ + //FIXME my_dcopy(m,xk,1,xp,1); /* xp<=xk : not necessary because already copied */ + //FIXME my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */ my_daxpy(m,pk,-alphaj+aj,xp); /* xp<=xp+(aj)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - phi_aj=my_dnrm2(n,x); - phi_aj*=phi_aj; - + phi_aj=func(xp,m,adata); if ((phi_j>phi_0+rho*alphaj*gphi_0) || phi_j>=phi_aj) { bj=alphaj; /* aj unchanged */ } else { /* evaluate grad(alphaj) */ - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+aj*pk */ - //my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */ + //FIXME my_dcopy(m,xk,1,xp,1); /* xp<=xk */ + //FIXME my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */ my_daxpy(m,pk,-aj+alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - p01=my_dnrm2(n,x); + p01=func(xp,m,adata); my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - p02=my_dnrm2(n,x); - gphi_j=(p01*p01-p02*p02)/(2.0*step); + p02=func(xp,m,adata); + gphi_j=(p01-p02)/(2.0*step); /* termination due to roundoff/other errors pp. 38, Fletcher */ if ((aj-alphaj)*gphi_j<=step) { @@ -605,29 +286,25 @@ linesearch_zoom( /* line search */ -/* func: vector function +/* func: scalar function xk: parameter values size m x 1 (at which step is calculated) pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) alpha1: initial value for step sigma,rho,t1,t2,t3: line search parameters (from Fletcher) - xo: observed data size n x 1 - n: size of vector function + m: size or parameter vector step: step size for differencing adata: additional data passed to the function */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static double +double linesearch( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) { + double (*func)(double *p, int m, void *adata), + double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, int m, double step, void *adata) { /* phi(alpha)=f(xk+alpha pk) for vector function func f(xk) =||func(xk)||^2 */ - double *x,*xp; + double *xp; double alphai,alphai1; double phi_0,phi_alphai,phi_alphai1; double p01,p02; @@ -639,54 +316,36 @@ linesearch( int ci; - if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } if ((xp=(double*)calloc((size_t)m,sizeof(double)))==0) { -#ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif exit(1); } alphak=1.0; /* evaluate phi_0 and grad(phi_0) */ - func(xk,x,m,n,adata); - my_daxpy(n,xo,-1.0,x); - phi_0=my_dnrm2(n,x); - phi_0*=phi_0; + phi_0=func(xk,m,adata); /* select tolarance 1/100 of current function value */ tol=MIN(0.01*phi_0,1e-6); /* grad(phi_0): evaluate at -step and +step */ my_dcopy(m,xk,1,xp,1); /* xp<=xk */ my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - p01=my_dnrm2(n,x); + p01=func(xp,m,adata); my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - p02=my_dnrm2(n,x); - gphi_0=(p01*p01-p02*p02)/(2.0*step); + p02=func(xp,m,adata); + gphi_0=(p01-p02)/(2.0*step); /* estimate for mu */ /* mu = (tol-phi_0)/(rho gphi_0) */ mu=(tol-phi_0)/(rho*gphi_0); #ifdef DEBUG - printf("mu=%lf, alpha1=%lf\n",mu,alpha1); + printf("deltaphi=%lf, mu=%lf, alpha1=%lf\n",gphi_0,mu,alpha1); #endif /* catch if not finite (deltaphi=0 or nan) */ if (!isnormal(mu)) { - free(x); free(xp); #ifdef DEBUG - printf("line interval too small\n"); + printf("line interval too small\n"); #endif return mu; } @@ -699,11 +358,7 @@ linesearch( /* evalualte phi(alpha(i))=f(xk+alphai pk) */ my_dcopy(m,xk,1,xp,1); /* xp<=xk */ my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - phi_alphai=my_dnrm2(n,x); - phi_alphai*=phi_alphai; + phi_alphai=func(xp,m,adata); if (phi_alphaiphi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) { /* ai=alphai1, bi=alphai bracket */ - alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata); + alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,m,step,adata); #ifdef DEBUG printf("Linesearch : Condition 1 met\n"); #endif @@ -723,19 +378,14 @@ linesearch( } /* evaluate grad(phi(alpha(i))) */ - //my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here because already xp=xk+alphai*pk */ - //my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */ + //FIXME my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here?? xp<=xk */ + //FIXME my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */ + /* note that xp already is xk+alphai. pk, so only add the missing term */ my_daxpy(m,pk,step,xp); /* xp<=xp+(alphai+step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - p01=my_dnrm2(n,x); + p01=func(xp,m,adata); my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - my_daxpy(n,xo,-1.0,x); - p02=my_dnrm2(n,x); - gphi_i=(p01*p01-p02*p02)/(2.0*step); + p02=func(xp,m,adata); + gphi_i=(p01-p02)/(2.0*step); if (fabs(gphi_i)<=-sigma*gphi_0) { alphak=alphai; @@ -747,7 +397,7 @@ linesearch( if (gphi_i>=0) { /* ai=alphai, bi=alphai1 bracket */ - alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata); + alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,m,step,adata); #ifdef DEBUG printf("Linesearch : Condition 3 met\n"); #endif @@ -763,7 +413,7 @@ linesearch( /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */ p01=2.0*alphai-alphai1; p02=MIN(mu,alphai+t1*(alphai-alphai1)); - alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata); + alphai=cubic_interp(func,xk,pk,p01,p02,xp,m,step,adata); //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai); } phi_alphai1=phi_alphai; @@ -773,7 +423,6 @@ linesearch( - free(x); free(xp); #ifdef DEBUG printf("Step size=%g\n",alphak); @@ -782,10 +431,26 @@ linesearch( } /*************** END Fletcher line search **********************************/ + + +/* cost function : return a scalar cost, input : p (mx1) parameters, m: no. of params, adata: additional data + grad function: return gradient (mx1): input : p (mx1) parameters, g (mx1) gradient vector, m: no. of params, adata: additional data +*/ +/* + p: parameters m x 1 (used as initial value, output final value) + itmax: max iterations + M: BFGS memory size + adata: additional data +*/ + int lbfgs_fit( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) { + double (*cost_func)(double *p, int m, void *adata), + void (*grad_func)(double *p, double *g, int m, void *adata), + /* the following is the mini batch function, called per eath iteration + to change the input/output data used for the next iteration, if =NULL, full batch mode */ + void (*change_batch)(int iter, void *adata), /* iter= iteration number */ + double *p, int m, int itmax, int M, void *adata) { double *gk; /* gradients at both k+1 and k iter */ double *xk1,*xk; /* parameters at k+1 and k iter */ @@ -799,56 +464,45 @@ lbfgs_fit( if ((gk=(double*)calloc((size_t)m,sizeof(double)))==0) { -#ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif exit(1); } if ((xk1=(double*)calloc((size_t)m,sizeof(double)))==0) { -#ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif exit(1); } if ((xk=(double*)calloc((size_t)m,sizeof(double)))==0) { -#ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif exit(1); } if ((pk=(double*)calloc((size_t)m,sizeof(double)))==0) { -#ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif exit(1); } /* storage size mM x 1*/ if ((s=(double*)calloc((size_t)m*M,sizeof(double)))==0) { -#ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif exit(1); } if ((y=(double*)calloc((size_t)m*M,sizeof(double)))==0) { -#ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif exit(1); } if ((rho=(double*)calloc((size_t)M,sizeof(double)))==0) { -#ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif exit(1); } /* initial value for params xk=p */ my_dcopy(m,p,1,xk,1); + if (change_batch) { + change_batch(0,adata); + } /* gradient gk=grad(f)_k */ - func_grad(func,xk,gk,x,m,n,step,adata); + grad_func(xk,gk,m,adata); double gradnrm=my_dnrm2(m,gk); /* if gradient is too small, no need to solve, so stop */ if (gradnrm %lf\n\n\n",info0[0],info0[1]); /* use LBFGS */ if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) { lmdata0.robust_nu=robust_nu0; - lbfgs_fit_robust_cuda(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0); + lbfgs_fit_robust_cuda(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0); } else { - lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0); + lbfgs_fit(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0); } } - /* final residual calculation */ + /* final residual calculation: FIXME: possible GPU accel here? */ minimize_viz_full_pth(p, xsub, m, n, (void*)&lmdata0); my_daxpy(n, xsub, -1.0, x); diff --git a/src/lib/Dirac/lmfit_nocuda.c b/src/lib/Dirac/lmfit_nocuda.c index 35fd301..9f617cc 100644 --- a/src/lib/Dirac/lmfit_nocuda.c +++ b/src/lib/Dirac/lmfit_nocuda.c @@ -688,7 +688,7 @@ predict_threadfn_withgain_full(void *data) { #ifdef USE_MIC __attribute__ ((target(MIC))) #endif -static void +void minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) { me_data_t *dp=(me_data_t*)data; @@ -1023,9 +1023,9 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res); /* use LBFGS */ if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) { lmdata.robust_nu=robust_nu0; - lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); + lbfgs_fit_robust_wrapper(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); } else { - lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); + lbfgs_fit_wrapper(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); } #ifdef USE_MIC lmdata.Nt=Nt; /* reset threads for MIC */ @@ -1181,9 +1181,9 @@ bfgsfit_visibilities(double *u, double *v, double *w, double *x, int N, /* use LBFGS */ if (solver_mode==2 || solver_mode==3) { lmdata.robust_nu=mean_nu; - lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); + lbfgs_fit_robust_wrapper(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); } else { - lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); + lbfgs_fit_wrapper(p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata); } #ifdef USE_MIC lmdata.Nt=Nt; /* reset threads for MIC */ diff --git a/src/lib/Dirac/robust_lbfgs_nocuda.c b/src/lib/Dirac/robust_lbfgs_nocuda.c index 53e61c7..570c2bb 100644 --- a/src/lib/Dirac/robust_lbfgs_nocuda.c +++ b/src/lib/Dirac/robust_lbfgs_nocuda.c @@ -54,6 +54,8 @@ ambt(complex double * __restrict a, complex double * __restrict b, complex doubl } /**** end repeated code ********************/ + +/*************************************** ROBUST ***************************/ /***************************************************************/ /* worker thread to calculate sum ( log(1+ (y_i-f_i)^2/nu) ) @@ -78,7 +80,7 @@ func_robust_th(void *data) { /* recalculate log(1+ (y_i-f_i)^2/nu) from function() that calculates f_i y (data) - f=function() + f=model vector x=log(1+(y_i-f_i)^2/nu) output all size n x 1 Nt: no of threads @@ -144,518 +146,7 @@ func_robust( } /***************************************************************/ -/* use algorithm 9.1 to compute pk=Hk gk */ -/* pk,gk: size m x 1 - s, y: size mM x 1 - rho: size M x 1 - ii: true location of the k th values in s,y */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static void -mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, int M, int ii) { - int ci; - double *alphai; - int *idx; /* store sorted locations of s, y here */ - double gamma,beta; - if ((alphai=(double*)calloc((size_t)M,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - if ((idx=(int*)calloc((size_t)M,sizeof(int)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - if (M>0) { - /* find the location of k-1 th value */ - if (ii>0) { - ii=ii-1; - } else { - ii=M-1; - } - /* s,y will have 0,1,...,ii,ii+1,...M-1 */ - /* map this to ii+1,ii+2,...,M-1,0,1,..,ii */ - for (ci=0; ci%d ",ci,idx[ci]); - } - printf("\n"); -#endif - /* q = grad(f)k : pk<=gk */ - my_dcopy(m,gk,1,pk,1); - /* this should be done in the right order */ - for (ci=0; ci0) { - gamma=my_ddot(m,&s[m*idx[M-1]],&y[m*idx[M-1]]); - gamma/=my_ddot(m,&y[m*idx[M-1]],&y[m*idx[M-1]]); - /* Hk(0)=gamma I, so scale q by gamma */ - /* r= Hk(0) q */ - my_dscal(m,gamma,pk); - } - - for (ci=0; cib is possible) - to find step that minimizes cost function */ -/* func: vector function - xk: parameter values size m x 1 (at which step is calculated) - pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) - a/b: interval for interpolation - x: size n x 1 (storage) - xp: size m x 1 (storage) - xo: observed data size n x 1 - n: size of vector function - step: step size for differencing - adata: additional data passed to the function -*/ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static double -cubic_interp( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *xk, double *pk, double a, double b, double *x, double *xp, double *xo, int m, int n, double step, void *adata) { - - me_data_t *dp=(me_data_t*)adata; - double f0,f1,f0d,f1d; /* function values and derivatives at a,b */ - double p01,p02,z0,fz0; - double aa,cc; - - my_dcopy(m,xk,1,xp,1); /* xp<=xk */ - my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */ - func(xp,x,m,n,adata); - //my_daxpy(n,xo,-1.0,x); - //f0=my_dnrm2(n,x); - //f0*=f0; - f0=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - /* grad(phi_0): evaluate at -step and +step */ - my_daxpy(m,pk,step,xp); /* xp<=xp+(a+step)*pk */ - func(xp,x,m,n,adata); - //my_daxpy(n,xo,-1.0,x); - //p01=my_dnrm2(n,x); - p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(a-step)*pk */ - func(xp,x,m,n,adata); - //my_daxpy(n,xo,-1.0,x); - //p02=my_dnrm2(n,x); - p02=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - f0d=(p01-p02)/(2.0*step); - - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp=xk+(a-step)*pk */ - //my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */ - my_daxpy(m,pk,-a+step+b,xp); /* xp<=xp+(b)*pk */ - func(xp,x,m,n,adata); - //my_daxpy(n,xo,-1.0,x); - //f1=my_dnrm2(n,x); - //f1*=f1; - f1=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - /* grad(phi_1): evaluate at -step and +step */ - my_daxpy(m,pk,step,xp); /* xp<=xp+(b+step)*pk */ - func(xp,x,m,n,adata); - //my_daxpy(n,xo,-1.0,x); - //p01=my_dnrm2(n,x); - p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(b-step)*pk */ - func(xp,x,m,n,adata); - //my_daxpy(n,xo,-1.0,x); - //p02=my_dnrm2(n,x); - p02=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - f1d=(p01-p02)/(2.0*step); - - - //printf("Interp a,f(a),f'(a): (%lf,%lf,%lf) (%lf,%lf,%lf)\n",a,f0,f0d,b,f1,f1d); - /* cubic poly in [0,1] is f0+f0d z+eta z^2+xi z^3 - where eta=3(f1-f0)-2f0d-f1d, xi=f0d+f1d-2(f1-f0) - derivative f0d+2 eta z+3 xi z^2 => cc+bb z+aa z^2 */ - aa=3.0*(f0-f1)/(b-a)+(f1d-f0d); - p01=aa*aa-f0d*f1d; - /* root exist? */ - if (p01>0.0) { - /* root */ - cc=sqrt(p01); - z0=b-(f1d+cc-aa)*(b-a)/(f1d-f0d+2.0*cc); - /* FIXME: check if this is within boundary */ - aa=MAX(a,b); - cc=MIN(a,b); - //printf("Root=%lf, in [%lf,%lf]\n",z0,cc,aa); - if (z0>aa || z0robust_nu,dp->Nt); - } - - /* now choose between f0,f1,fz0,fz1 */ - if (f0b) is possible - x: size n x 1 (storage) - xp: size m x 1 (storage) - phi_0: phi(0) - gphi_0: grad(phi(0)) - xo: observed data size n x 1 - n: size of vector function - step: step size for differencing - adata: additional data passed to the function -*/ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static double -linesearch_zoom( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *xk, double *pk, double a, double b, double *x, double *xp, double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) { - - me_data_t *dp=(me_data_t*)adata; - double alphaj,phi_j,phi_aj; - double gphi_j,p01,p02,aj,bj; - double alphak=1.0; - int ci,found_step=0; - - aj=a; - bj=b; - ci=0; - while(ci<10) { - /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */ - p01=aj+t2*(bj-aj); - p02=bj-t3*(bj-aj); - alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata); - //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj); - - /* evaluate phi(alphaj) */ - my_dcopy(m,xk,1,xp,1); /* xp<=xk */ - my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - //my_daxpy(n,xo,-1.0,x); - //phi_j=my_dnrm2(n,x); - //phi_j*=phi_j; - phi_j=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - - /* evaluate phi(aj) */ - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+alphaj*pk */ - //my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */ - my_daxpy(m,pk,-alphaj+aj,xp); /* xp<=xp+(aj)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - //my_daxpy(n,xo,-1.0,x); - //phi_aj=my_dnrm2(n,x); - //phi_aj*=phi_aj; - phi_aj=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - -#ifdef DEBUG - printf("phi_j=%lf, phi_aj=%lf\n",phi_j,phi_aj); -#endif - if ((phi_j>phi_0+rho*alphaj*gphi_0) || phi_j>=phi_aj) { - bj=alphaj; /* aj unchanged */ - } else { - /* evaluate grad(alphaj) */ - //my_dcopy(m,xk,1,xp,1); /* xp<=xk: NOT NEEDED because already xp = xk+aj*pk */ - //my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */ - my_daxpy(m,pk,-aj+alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - //my_daxpy(n,xo,-1.0,x); - //p01=my_dnrm2(n,x); - p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - - my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - //my_daxpy(n,xo,-1.0,x); - //p02=my_dnrm2(n,x); - p02=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - gphi_j=(p01-p02)/(2.0*step); -#ifdef DEBUG - printf("p01=%lf, p02=%lf\n",p01,p02); -#endif - - /* termination due to roundoff/other errors pp. 38, Fletcher */ - if ((aj-alphaj)*gphi_j<=step) { - alphak=alphaj; - found_step=1; - break; - } - - if (fabs(gphi_j)<=-sigma*gphi_0) { - alphak=alphaj; - found_step=1; - break; - } - - if (gphi_j*(bj-aj)>=0) { - bj=aj; - } /* else bj unchanged */ - aj=alphaj; - } - ci++; - } - - if (!found_step) { - /* use bound to find possible step */ - alphak=alphaj; - } - -#ifdef DEBUG - printf("Found %lf Interval [%lf,%lf]\n",alphak,a,b); -#endif - return alphak; -} - - - -/* line search */ -/* func: vector function - xk: parameter values size m x 1 (at which step is calculated) - pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk) - alpha1: initial value for step - sigma,rho,t1,t2,t3: line search parameters (from Fletcher) - xo: observed data size n x 1 - n: size of vector function - step: step size for differencing - adata: additional data passed to the function -*/ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static double -linesearch( - void (*func)(double *p, double *hx, int m, int n, void *adata), - double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) { - - /* phi(alpha)=f(xk+alpha pk) - for vector function func - f(xk) =||func(xk)||^2 */ - - me_data_t *dp=(me_data_t*)adata; - double *x,*xp; - double alphai,alphai1; - double phi_0,phi_alphai,phi_alphai1; - double p01,p02; - double gphi_0,gphi_i; - double alphak; - - double mu; - double tol; /* lower limit for minimization */ - - int ci; - - if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - if ((xp=(double*)calloc((size_t)m,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - - alphak=1.0; - /* evaluate phi_0 and grad(phi_0) */ - func(xk,x,m,n,adata); - //my_daxpy(n,xo,-1.0,x); - //phi_0=my_dnrm2(n,x); - //phi_0*=phi_0; - phi_0=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - /* select tolarance 1/100 of current function value */ - tol=MIN(0.01*phi_0,1e-6); - - - /* grad(phi_0): evaluate at -step and +step */ - my_dcopy(m,xk,1,xp,1); /* xp<=xk */ - my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - //my_daxpy(n,xo,-1.0,x); - //p01=my_dnrm2(n,x); - p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - //my_daxpy(n,xo,-1.0,x); - //p02=my_dnrm2(n,x); - p02=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - gphi_0=(p01-p02)/(2.0*step); - - - /* estimate for mu */ - /* mu = (tol-phi_0)/(rho gphi_0) */ - mu=(tol-phi_0)/(rho*gphi_0); -#ifdef DEBUG - printf("cost=%lf grad=%lf mu=%lf, alpha1=%lf\n",phi_0,gphi_0,mu,alpha1); -#endif - /* catch if not finite (deltaphi=0 or nan) */ - if (!isnormal(mu)) { - free(x); - free(xp); -#ifdef DEBUG - printf("line interval too small\n"); -#endif - return mu; - } - - - ci=1; - alphai=alpha1; /* initial value for alpha(i) : check if 0robust_nu,dp->Nt); - - if (phi_alphaiphi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) { - /* ai=alphai1, bi=alphai bracket */ - alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata); -#ifdef DEBUG - printf("Linesearch : Condition 1 met\n"); -#endif - break; - } - - /* evaluate grad(phi(alpha(i))) */ - //my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here because already xp=xk+alphai*pk */ - //my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */ - my_daxpy(m,pk,step,xp); /* xp<=xp+(alphai+step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - //my_daxpy(n,xo,-1.0,x); - //p01=my_dnrm2(n,x); - p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */ - func(xp,x,m,n,adata); - /* calculate x<=x-xo */ - //my_daxpy(n,xo,-1.0,x); - //p02=my_dnrm2(n,x); - p01=func_robust(x,xo,n,dp->robust_nu,dp->Nt); - gphi_i=(p01-p02)/(2.0*step); - - if (fabs(gphi_i)<=-sigma*gphi_0) { - alphak=alphai; -#ifdef DEBUG - printf("Linesearch : Condition 2 met\n"); -#endif - break; - } - - if (gphi_i>=0) { - /* ai=alphai, bi=alphai1 bracket */ - alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata); -#ifdef DEBUG - printf("Linesearch : Condition 3 met\n"); -#endif - break; - } - - /* else preserve old values */ - if (mu<=(2*alphai-alphai1)) { - /* next step */ - alphai1=alphai; - alphai=mu; - } else { - /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */ - p01=2*alphai-alphai1; - p02=MIN(mu,alphai+t1*(alphai-alphai1)); - alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata); - //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai); - } - phi_alphai1=phi_alphai; - - ci++; - } - - - - free(x); - free(xp); -#ifdef DEBUG - printf("Step size=%lf\n",alphak); -#endif - return alphak; -} -/*************** END Fletcher line search **********************************/ - -/*************************************** ROBUST ***************************/ /* worker thread for a cpu */ #ifdef USE_MIC __attribute__ ((target(MIC))) @@ -816,13 +307,12 @@ cpu_calc_deriv_robust(void *adata) { return NULL; } /* calculate gradient */ -/* func: vector function +/* func: vector function to predict model p: parameter values size m x 1 (at which grad is calculated) g: gradient size m x 1 xo: observed data size n x 1 robust_nu: nu in T distribution n: size of vector function - step: step size for differencing adata: additional data passed to the function */ #ifdef USE_MIC @@ -831,8 +321,8 @@ __attribute__ ((target(MIC))) static int func_grad_robust( void (*func)(double *p, double *hx, int m, int n, void *adata), - double *p, double *g, double *xo, int m, int n, double step, void *adata) { - /* gradient for each parameter is + double *p, double *g, double *xo, int m, int n, void *adata) { + /* numerical gradient for each parameter is (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step i=0,...,m-1 for all parameters e_i: unit vector, 1 only at i-th location @@ -922,162 +412,352 @@ func_grad_robust( free(x); return 0; } +/*************************************** END ROBUST ***************************/ -int -lbfgs_fit_robust( + + +/* worker thread for a cpu */ +#ifdef USE_MIC +__attribute__ ((target(MIC))) +#endif +static void * +cpu_calc_deriv(void *adata) { + thread_data_grad_t *t=(thread_data_grad_t*)adata; + + int ci,nb; + int stc,stoff,stm,sta1,sta2; + int N=t->N; /* stations */ + int M=t->M; /* clusters */ + int Nbase=(t->Nbase)*(t->tilesz); + + + complex double xr[4]; /* residuals */ + complex double G1[4],G2[4],C[4],T1[4],T2[4]; + double pp[8]; + complex double csum; + int cli,tpchunk,pstart,nchunk,tilesperchunk,stci,ttile,tptile,poff; + + /* iterate over each paramter */ + for (ci=t->g_start; ci<=t->g_end; ++ci) { + t->g[ci]=0.0; + /* find station and parameter corresponding to this value of ci */ + /* this parameter should correspond to the right baseline (x tilesz) + to contribute to the derivative */ + cli=0; + while((clicarr[cli].p[0] || ci>t->carr[cli].p[0]+8*N*t->carr[cli].nchunk-1)) { + cli++; + } + /* now either cli>=M: cluster not found + or cli=t->carr[cli-1].p[0] && ci<=t->carr[cli-1].p[0]+8*N*t->carr[cli-1].nchunk-1) { + cli--; + } + + if (clicarr[cli].p[0]; + + stc=(stci%(8*N))/8; /* 0..N-1 */ + /* make sure this baseline contribute to this parameter */ + tpchunk=stci/(8*N); + nchunk=t->carr[cli].nchunk; + pstart=t->carr[cli].p[0]; + tilesperchunk=(t->tilesz+nchunk-1)/nchunk; + + + /* iterate over all baselines and accumulate sum */ + for (nb=0; nbNbase; + /* which chunk this tile belongs to */ + tptile=ttile/tilesperchunk; + /* now tptile has to match tpchunk, otherwise ignore calculation */ + if (tptile==tpchunk) { + + sta1=t->barr[nb].sta1; + sta2=t->barr[nb].sta2; + + if (((stc==sta1)||(stc==sta2))&& !t->barr[nb].flag) { + /* this baseline has a contribution */ + /* which paramter of this station */ + stoff=(stci%(8*N))%8; /* 0..7 */ + /* which cluster */ + stm=cli; /* 0..M-1 */ + + /* exact expression for derivative + 2 real( vec^H(residual_this_baseline) + * vec(-J_{pm}C_{pqm} J_{qm}^H) + where m: chosen cluster + J_{pm},J_{qm} Jones matrices for baseline p-q + depending on the parameter, J ==> E + E: zero matrix, except 1 at location of m + + residual : in x[8*nb:8*nb+7] + C coh: in coh[8*M*nb+m*8:8*M*nb+m*8+7] (double storage) + coh[4*M*nb+4*m:4*M*nb+4*m+3] (complex storage) + J_p,J_q: in p[sta1*8+m*8*N: sta1*8+m*8*N+7] + and p[sta2*8+m*8*N: sta2*8+m*8*N+ 7] + */ + /* read in residual vector, conjugated */ + xr[0]=(t->x[nb*8])-_Complex_I*(t->x[nb*8+1]); + xr[1]=(t->x[nb*8+2])-_Complex_I*(t->x[nb*8+3]); + xr[2]=(t->x[nb*8+4])-_Complex_I*(t->x[nb*8+5]); + xr[3]=(t->x[nb*8+6])-_Complex_I*(t->x[nb*8+7]); + + /* read in coherency */ + C[0]=t->coh[4*M*nb+4*stm]; + C[1]=t->coh[4*M*nb+4*stm+1]; + C[2]=t->coh[4*M*nb+4*stm+2]; + C[3]=t->coh[4*M*nb+4*stm+3]; + + memset(pp,0,sizeof(double)*8); + if (stc==sta1) { + /* this station parameter gradient */ + pp[stoff]=1.0; + memset(G1,0,sizeof(complex double)*4); + G1[0]=pp[0]+_Complex_I*pp[1]; + G1[1]=pp[2]+_Complex_I*pp[3]; + G1[2]=pp[4]+_Complex_I*pp[5]; + G1[3]=pp[6]+_Complex_I*pp[7]; + poff=pstart+tpchunk*8*N+sta2*8; + G2[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]); + G2[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]); + G2[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+4]); + G2[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]); + + } else if (stc==sta2) { + memset(G2,0,sizeof(complex double)*4); + pp[stoff]=1.0; + G2[0]=pp[0]+_Complex_I*pp[1]; + G2[1]=pp[2]+_Complex_I*pp[3]; + G2[2]=pp[4]+_Complex_I*pp[5]; + G2[3]=pp[6]+_Complex_I*pp[7]; + poff=pstart+tpchunk*8*N+sta1*8; + G1[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]); + G1[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]); + G1[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+5]); + G1[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]); + } + + /* T1=G1*C */ + amb(G1,C,T1); + /* T2=T1*G2' */ + ambt(T1,G2,T2); + + /* calculate product xr*vec(J_p C J_q^H ) */ + csum=xr[0]*T2[0]; + csum+=xr[1]*T2[1]; + csum+=xr[2]*T2[2]; + csum+=xr[3]*T2[3]; + + /* accumulate sum */ + t->g[ci]+=-2.0*creal(csum); + } + } + } + } + } + + + return NULL; +} + +#ifdef USE_MIC +__attribute__ ((target(MIC))) +#endif +static int +func_grad( void (*func)(double *p, double *hx, int m, int n, void *adata), - double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, - void *adata) { + double *p, double *g, double *xo, int m, int n,void *adata) { + /* numerical gradient for each parameter is + (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step + i=0,...,m-1 for all parameters + e_i: unit vector, 1 only at i-th location + */ - double *gk; /* gradients at both k+1 and k iter */ - double *xk1,*xk; /* parameters at k+1 and k iter */ - double *pk; /* step direction H_k * grad(f) */ + double *x; /* array to store residual */ + int ci; + me_data_t *dp=(me_data_t*)adata; - double step=1e-6; - double *y, *s; /* storage for delta(grad) and delta(p) */ - double *rho; /* storage for 1/yk^T*sk */ - int ci,ck,cm; - double alphak=1.0; - + int Nt=dp->Nt; - if ((gk=(double*)calloc((size_t)m,sizeof(double)))==0) { + pthread_attr_t attr; + pthread_t *th_array; + thread_data_grad_t *threaddata; + + + if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) { #ifndef USE_MIC fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); #endif exit(1); } - if ((xk1=(double*)calloc((size_t)m,sizeof(double)))==0) { + /* evaluate func once, store in x, and create threads */ + /* and calculate the residual x=xo-func */ + func(p,x,m,n,adata); + /* calculate x<=x-xo */ + my_daxpy(n,xo,-1.0,x); + + /* setup threads */ + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE); + + if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) { #ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); #endif - exit(1); + exit(1); } - if ((xk=(double*)calloc((size_t)m,sizeof(double)))==0) { + if ((threaddata=(thread_data_grad_t*)malloc((size_t)Nt*sizeof(thread_data_grad_t)))==0) { #ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); #endif - exit(1); + exit(1); } - if ((pk=(double*)calloc((size_t)m,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } + int nth,nth1,Nparm; + /* parameters per thread */ + Nparm=(m+Nt-1)/Nt; - /* storage size mM x 1*/ - if ((s=(double*)calloc((size_t)m*M,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - if ((y=(double*)calloc((size_t)m*M,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - if ((rho=(double*)calloc((size_t)M,sizeof(double)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - - /* initial value for params xk=p */ - my_dcopy(m,p,1,xk,1); - /* gradient gk=grad(f)_k */ - func_grad_robust(func,xk,gk,x,m,n,step,adata); - double gradnrm=my_dnrm2(m,gk); - /* if gradient is too small, no need to solve, so stop */ - if (gradnrmCLM_STOP_THRESH) { - /* mult with hessian pk=-H_k*gk */ - if (ckNbase; + threaddata[nth].tilesz=dp->tilesz; + threaddata[nth].barr=dp->barr; + threaddata[nth].carr=dp->carr; + threaddata[nth].M=dp->M; + threaddata[nth].N=dp->N; + threaddata[nth].coh=dp->coh; + threaddata[nth].m=m; + threaddata[nth].n=n; + threaddata[nth].x=x; + threaddata[nth].p=p; + threaddata[nth].g=g; + threaddata[nth].g_start=ci; + threaddata[nth].g_end=ci+Nparm-1; + if (threaddata[nth].g_end>=m) { + threaddata[nth].g_end=m-1; } + ci=ci+Nparm; + pthread_create(&th_array[nth],&attr,cpu_calc_deriv,(void*)(&threaddata[nth])); } + /* now wait for threads to finish */ + for(nth1=0; nth1x; /* input data */ + int n=dp->n; + double *xmodel; + + if ((xmodel=(double*)calloc((size_t)n,sizeof(double)))==0) { + fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + exit(1); + } + + /* predict model */ + minimize_viz_full_pth(p, xmodel, m, n, dp->adata); + /* find ||x-xmodel||^2 */ + my_daxpy(n,x,-1.0,xmodel); + double f0=my_dnrm2(n,xmodel); + f0*=f0; + + free(xmodel); + return f0; +} + +static void +grad_func(double *p, double *g, int m, void *adata) { + wrapper_me_data_t *dp=(wrapper_me_data_t*)adata; + double *x=dp->x; /* input data */ + int n=dp->n; + + func_grad(&minimize_viz_full_pth,p,g,x, m, n, dp->adata); +} + + +static double +robust_cost_func(double *p, int m, void *adata) { + wrapper_me_data_t *dp=(wrapper_me_data_t*)adata; + me_data_t *d1=(me_data_t *)dp->adata; + double *x=dp->x; /* input data */ + int n=dp->n; + double *xmodel; + + if ((xmodel=(double*)calloc((size_t)n,sizeof(double)))==0) { + fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + exit(1); + } + + /* predict model */ + minimize_viz_full_pth(p, xmodel, m, n, dp->adata); + + double f0=func_robust(x,xmodel,n, d1->robust_nu, d1->Nt); + + free(xmodel); + return f0; +} + +static void +robust_grad_func(double *p, double *g, int m, void *adata) { + wrapper_me_data_t *dp=(wrapper_me_data_t*)adata; + double *x=dp->x; /* input data */ + int n=dp->n; + + func_grad_robust(&minimize_viz_full_pth,p,g,x, m, n, dp->adata); +} + +int +lbfgs_fit_wrapper( + double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, + void *adata) { + wrapper_me_data_t data1; + data1.adata=(me_data_t *)adata; + data1.x=x; + data1.n=n; + + + lbfgs_fit(&cost_func,&grad_func,NULL,p,m,itmax,M,&data1); + return 0; +} + +int +lbfgs_fit_robust_wrapper( + double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, + void *adata) { + wrapper_me_data_t data1; + data1.adata=(me_data_t *)adata; + data1.x=x; + data1.n=n; + + + lbfgs_fit(&robust_cost_func,&robust_grad_func,NULL,p,m,itmax,M,&data1); return 0; } From d11e1653fde199b8b831a397feaa1ef4aa2950a2 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Mon, 30 Jul 2018 16:16:03 +0200 Subject: [PATCH 2/4] make sure rho text file exactly matches no. clusters --- src/MPI/main.cpp | 4 ++-- src/lib/Radio/readsky.c | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/MPI/main.cpp b/src/MPI/main.cpp index 26e1d6b..655e4d2 100644 --- a/src/MPI/main.cpp +++ b/src/MPI/main.cpp @@ -34,7 +34,7 @@ using namespace Data; void print_copyright(void) { - cout<<"SAGECal-MPI 0.5.0 (C) 2011-2018 Sarod Yatawatta"<0, adaptive update of regularization factor: default "<=0) { +//printf("Mt=%d M=%d\n",Mt,M); + while(c>=0 && cj0) { /* found a valid line */ @@ -804,8 +806,9 @@ read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double c=skip_restof_line(cfp); c=skip_lines(cfp); } +//printf("c=%d ci=%d cj=%d\n",c,ci,cj); /* report any errors */ - if (!(c==EOF && ci==Mt-1)) { + if (!(c==EOF && ci==Mt-1 && cj=M)) { fprintf(stderr,"%s: %d: Error: cluster numbers in cluster file and regularization file do not match up.\n",__FILE__,__LINE__); exit(1); } From 39d3ed8ad3bbb18080d3c40b894415f171c76e3f Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Mon, 30 Jul 2018 16:33:46 +0200 Subject: [PATCH 3/4] fix typo --- src/lib/Radio/readsky.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/Radio/readsky.c b/src/lib/Radio/readsky.c index a2fccba..1a9cde1 100644 --- a/src/lib/Radio/readsky.c +++ b/src/lib/Radio/readsky.c @@ -808,7 +808,7 @@ read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double } //printf("c=%d ci=%d cj=%d\n",c,ci,cj); /* report any errors */ - if (!(c==EOF && ci==Mt-1 && cj=M)) { + if (!(c==EOF && ci==Mt-1 && cj==M)) { fprintf(stderr,"%s: %d: Error: cluster numbers in cluster file and regularization file do not match up.\n",__FILE__,__LINE__); exit(1); } From 0b072da621f8217270c29e5a63048a3a95ac336c Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Tue, 31 Jul 2018 11:49:23 +0200 Subject: [PATCH 4/4] fixing issue #52 --- src/lib/Dirac/Dirac.h | 7 ++ src/lib/Dirac/rtr_solve.c | 18 +++-- src/lib/Dirac/rtr_solve_robust.c | 110 -------------------------- src/lib/Dirac/rtr_solve_robust_admm.c | 110 -------------------------- 4 files changed, 19 insertions(+), 226 deletions(-) diff --git a/src/lib/Dirac/Dirac.h b/src/lib/Dirac/Dirac.h index 5830600..5269206 100644 --- a/src/lib/Dirac/Dirac.h +++ b/src/lib/Dirac/Dirac.h @@ -967,6 +967,13 @@ typedef struct global_data_rtr_ { pthread_attr_t attr; } global_data_rtr_t; +/* function to count how many baselines contribute to the calculation of + grad and hess, so normalization can be made */ +#ifdef USE_MIC +__attribute__ ((target(MIC))) +#endif +extern void +fns_fcount(global_data_rtr_t *gdata); /* RTR (ICASSP 2013) */ #ifdef USE_MIC diff --git a/src/lib/Dirac/rtr_solve.c b/src/lib/Dirac/rtr_solve.c index 6218212..c1aefdf 100644 --- a/src/lib/Dirac/rtr_solve.c +++ b/src/lib/Dirac/rtr_solve.c @@ -81,13 +81,9 @@ threadfn_fns_fcount(void *data) { sta1=t->barr[ci+t->boff].sta1; sta2=t->barr[ci+t->boff].sta2; - pthread_mutex_lock(&t->mx_array[sta1]); t->bcount[sta1]+=1; - pthread_mutex_unlock(&t->mx_array[sta1]); - pthread_mutex_lock(&t->mx_array[sta2]); t->bcount[sta2]+=1; - pthread_mutex_unlock(&t->mx_array[sta2]); } } @@ -100,7 +96,7 @@ threadfn_fns_fcount(void *data) { #ifdef USE_MIC __attribute__ ((target(MIC))) #endif -static void +void fns_fcount(global_data_rtr_t *gdata) { me_data_t *dp=(me_data_t*)gdata->medata; @@ -146,7 +142,12 @@ fns_fcount(global_data_rtr_t *gdata) { threaddata[nth].boff=ci+boff; threaddata[nth].Nb=Nthb; threaddata[nth].barr=dp->barr; - threaddata[nth].bcount=bcount; /* note this should be 0 first */ + /* note bcount[] should be 0 first */ + if ((threaddata[nth].bcount=(int*)calloc((size_t)dp->N,sizeof(int)))==0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + threaddata[nth].mx_array=gdata->mx_array; pthread_create(&gdata->th_array[nth],&gdata->attr,threadfn_fns_fcount,(void*)(&threaddata[nth])); @@ -157,6 +158,11 @@ fns_fcount(global_data_rtr_t *gdata) { /* now wait for threads to finish */ for(nth1=0; nth1th_array[nth1],NULL); + /* add all thread bcount[] arrays into bcount[] */ + for (ci=0; ciN; ci++) { + bcount[ci]+=threaddata[nth1].bcount[ci]; + } + free(threaddata[nth1].bcount); } free(threaddata); diff --git a/src/lib/Dirac/rtr_solve_robust.c b/src/lib/Dirac/rtr_solve_robust.c index a19fced..6d7ba68 100644 --- a/src/lib/Dirac/rtr_solve_robust.c +++ b/src/lib/Dirac/rtr_solve_robust.c @@ -64,116 +64,6 @@ atmb(complex double * __restrict a, complex double * __restrict b, complex doubl } -/* worker thread function for counting */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static void * -threadfn_fns_fcount(void *data) { - thread_data_rtr_t *t=(thread_data_rtr_t*)data; - - int ci,sta1,sta2; - for (ci=0; ciNb; ci++) { - /* if this baseline is flagged, we do not compute */ - if (!t->barr[ci+t->boff].flag) { - - /* stations for this baseline */ - sta1=t->barr[ci+t->boff].sta1; - sta2=t->barr[ci+t->boff].sta2; - - pthread_mutex_lock(&t->mx_array[sta1]); - t->bcount[sta1]+=1; - pthread_mutex_unlock(&t->mx_array[sta1]); - - pthread_mutex_lock(&t->mx_array[sta2]); - t->bcount[sta2]+=1; - pthread_mutex_unlock(&t->mx_array[sta2]); - } - } - - return NULL; -} - - -/* function to count how many baselines contribute to the calculation of - grad and hess, so normalization can be made */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static void -fns_fcount(global_data_rtr_t *gdata) { - - me_data_t *dp=(me_data_t*)gdata->medata; - - int nth,nth1,ci; - - /* no of threads */ - int Nt=(dp->Nt); - int Nthb0,Nthb; - thread_data_rtr_t *threaddata; - - int Nbase1=(dp->Nbase)*(dp->tilesz); - int boff=(dp->Nbase)*(dp->tileoff); - - /* calculate min baselines a thread can handle */ - Nthb0=(Nbase1+Nt-1)/Nt; - - if ((threaddata=(thread_data_rtr_t*)malloc((size_t)Nt*sizeof(thread_data_rtr_t)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - int *bcount; - if ((bcount=(int*)calloc((size_t)dp->N,sizeof(int)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - - /* iterate over threads, allocating baselines per thread */ - ci=0; - for (nth=0; nthbarr; - threaddata[nth].bcount=bcount; /* note this should be 0 first */ - threaddata[nth].mx_array=gdata->mx_array; - - pthread_create(&gdata->th_array[nth],&gdata->attr,threadfn_fns_fcount,(void*)(&threaddata[nth])); - /* next baseline set */ - ci=ci+Nthb; - } - - /* now wait for threads to finish */ - for(nth1=0; nth1th_array[nth1],NULL); - } - free(threaddata); - - /* calculate inverse count */ - for (nth1=0; nth1N; nth1++) { - gdata->iw[nth1]=(bcount[nth1]>0?1.0/(double)bcount[nth1]:0.0); - } - free(bcount); - /* scale back weight such that max value is 1 */ - nth1=my_idamax(dp->N,gdata->iw,1); - double maxw=gdata->iw[nth1-1]; /* 1 based index */ - if (maxw>0.0) { /* all baselines can be flagged */ - my_dscal(dp->N,1.0/maxw,gdata->iw); - } -} - - /* worker thread function for cost function calculation */ #ifdef USE_MIC diff --git a/src/lib/Dirac/rtr_solve_robust_admm.c b/src/lib/Dirac/rtr_solve_robust_admm.c index 18773d4..008430e 100644 --- a/src/lib/Dirac/rtr_solve_robust_admm.c +++ b/src/lib/Dirac/rtr_solve_robust_admm.c @@ -64,116 +64,6 @@ atmb(complex double * __restrict a, complex double * __restrict b, complex doubl } -/* worker thread function for counting */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static void * -threadfn_fns_fcount(void *data) { - thread_data_rtr_t *t=(thread_data_rtr_t*)data; - - int ci,sta1,sta2; - for (ci=0; ciNb; ci++) { - /* if this baseline is flagged, we do not compute */ - if (!t->barr[ci+t->boff].flag) { - - /* stations for this baseline */ - sta1=t->barr[ci+t->boff].sta1; - sta2=t->barr[ci+t->boff].sta2; - - pthread_mutex_lock(&t->mx_array[sta1]); - t->bcount[sta1]+=1; - pthread_mutex_unlock(&t->mx_array[sta1]); - - pthread_mutex_lock(&t->mx_array[sta2]); - t->bcount[sta2]+=1; - pthread_mutex_unlock(&t->mx_array[sta2]); - } - } - - return NULL; -} - - -/* function to count how many baselines contribute to the calculation of - grad and hess, so normalization can be made */ -#ifdef USE_MIC -__attribute__ ((target(MIC))) -#endif -static void -fns_fcount(global_data_rtr_t *gdata) { - - me_data_t *dp=(me_data_t*)gdata->medata; - - int nth,nth1,ci; - - /* no of threads */ - int Nt=(dp->Nt); - int Nthb0,Nthb; - thread_data_rtr_t *threaddata; - - int Nbase1=(dp->Nbase)*(dp->tilesz); - int boff=(dp->Nbase)*(dp->tileoff); - - /* calculate min baselines a thread can handle */ - Nthb0=(Nbase1+Nt-1)/Nt; - - if ((threaddata=(thread_data_rtr_t*)malloc((size_t)Nt*sizeof(thread_data_rtr_t)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - int *bcount; - if ((bcount=(int*)calloc((size_t)dp->N,sizeof(int)))==0) { -#ifndef USE_MIC - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); -#endif - exit(1); - } - - /* iterate over threads, allocating baselines per thread */ - ci=0; - for (nth=0; nthbarr; - threaddata[nth].bcount=bcount; /* note this should be 0 first */ - threaddata[nth].mx_array=gdata->mx_array; - - pthread_create(&gdata->th_array[nth],&gdata->attr,threadfn_fns_fcount,(void*)(&threaddata[nth])); - /* next baseline set */ - ci=ci+Nthb; - } - - /* now wait for threads to finish */ - for(nth1=0; nth1th_array[nth1],NULL); - } - free(threaddata); - - /* calculate inverse count */ - for (nth1=0; nth1N; nth1++) { - gdata->iw[nth1]=(bcount[nth1]>0?1.0/(double)bcount[nth1]:0.0); - } - free(bcount); - /* scale back weight such that max value is 1 */ - nth1=my_idamax(dp->N,gdata->iw,1); - double maxw=gdata->iw[nth1-1]; /* 1 based index */ - if (maxw>0.0) { /* all baselines can be flagged */ - my_dscal(dp->N,1.0/maxw,gdata->iw); - } -} - - /* worker thread function for cost function calculation */ #ifdef USE_MIC