update to 0.3.8

This commit is contained in:
Sarod Yatawatta 2015-08-19 19:20:08 +02:00
parent b167c81854
commit 4f971a5cdd
27 changed files with 2446 additions and 387 deletions

View File

@ -13,6 +13,7 @@ LDFLAGS=-Wl,--rpath,/usr/local/OpenBLAS/lib/
#LDFLAGS=-Wl,-t,--rpath,/software/users/lofareor/SW/lib64
# -Wl,--hash-style=both
# with multithread FFTW
MY_LIBS=-lm -lsagecal
INCLUDES=-I. -I./lib -I$(CASA_INCDIR) -I/usr/include
LIBPATH=-L$(LAPACK_DIR) -L$(CASA_LIBDIR) -L./lib

View File

@ -33,7 +33,7 @@ using namespace Data;
void
print_copyright(void) {
cout<<"SAGECal-MPI 0.3.5 (C) 2011-2015 Sarod Yatawatta"<<endl;
cout<<"SAGECal-MPI 0.3.9 (C) 2011-2015 Sarod Yatawatta"<<endl;
}
@ -44,10 +44,10 @@ print_help(void) {
cout << "-f MSlist: text file with MS names" << endl;
cout << "-s sky.txt: sky model file"<< endl;
cout << "-c cluster.txt: cluster file"<< endl;
cout << "-p solutions.txt: if given, save solution in this file, if not given 'XXX.MS.solutions' will be used"<< endl;
cout << "-p solutions.txt: if given, save (global) solutions in this file, but slaves will always write to 'XXX.MS.solutions'"<< endl;
cout << "-F sky model format: 0: LSM, 1: LSM with 3 order spectra : default "<< Data::format<<endl;
cout << "-I input column (DATA/CORRECTED_DATA) : default " <<Data::DataField<< endl;
cout << "-O ouput column (DATA/CORRECTED_DATA) : default " <<Data::OutField<< endl;
cout << "-I input column (DATA/CORRECTED_DATA/...) : default " <<Data::DataField<< endl;
cout << "-O ouput column (DATA/CORRECTED_DATA/...) : default " <<Data::OutField<< endl;
cout << "-e max EM iterations : default " <<Data::max_emiter<< endl;
cout << "-g max iterations (within single EM) : default " <<Data::max_iter<< endl;
cout << "-l max LBFGS iterations : default " <<Data::max_lbfgs<< endl;
@ -57,16 +57,16 @@ print_help(void) {
cout << "-A ADMM iterations: default " <<Data::Nadmm<< endl;
cout << "-P consensus polynomial order: default " <<Data::Npoly<< endl;
cout << "-r regularization factor: default " <<Data::admm_rho<< endl;
cout << "-G regularization factor of each cluster (text file instead of -r): default : None" << endl;
cout << "-x exclude baselines length (lambda) lower than this in calibration : default "<<Data::min_uvcut << endl;
cout << "-y exclude baselines length (lambda) higher than this in calibration : default "<<Data::max_uvcut << endl;
cout <<endl<<"Advanced options:"<<endl;
cout << "-k cluster_id : correct residuals with solution of this cluster : default "<<Data::ccid<< endl;
cout << "-o robust rho, robust matrix inversion during correction: default "<<Data::rho<< endl;
cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: default "<<Data::solver_mode<< endl;
cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: 6: NSD, default "<<Data::solver_mode<< endl;
cout << "-L robust nu, lower bound: default "<<Data::nulow<< endl;
cout << "-H robust nu, upper bound: default "<<Data::nuhigh<< endl;
cout << "-R randomize iterations: default "<<Data::randomize<< endl;
cout << "-D 0,1,2 : if >0, enable diagnostics (Jacobian Leverage) 1 replace Jacobian Leverage as output, 2 only fractional noise/leverage is printed: default " <<Data::DoDiag<< endl;
cout << "-T stop after this number of solutions (0 means no limit): default "<<Data::Nmaxtime<< endl;
cout <<"Report bugs to <sarod@users.sf.net>"<<endl;
}
@ -75,7 +75,7 @@ print_help(void) {
void
ParseCmdLine(int ac, char **av) {
char c;
while((c=getopt(ac, av, "c:e:f:g:j:k:l:m:n:o:p:r:s:t:x:y:A:D:F:I:L:O:P:H:R:T:h"))!= -1)
while((c=getopt(ac, av, "c:e:f:g:j:k:l:m:n:o:p:r:s:t:x:y:A:F:I:L:O:P:G:H:R:T:h"))!= -1)
{
switch(c)
{
@ -147,6 +147,9 @@ ParseCmdLine(int ac, char **av) {
case 'r':
admm_rho= atof(optarg);
break;
case 'G':
admm_rho_file= optarg;
break;
case 'R':
randomize= atoi(optarg);
break;

View File

@ -67,40 +67,42 @@ sagecal_master(int argc, char **argv) {
iodata.Nms=ntasks-1;
/**** get info from slaves ***************************************/
int *bufint=new int[4];
int *bufint=new int[5];
double *bufdouble=new double[1];
iodata.freqs=new double[iodata.Nms];
iodata.freq0=0.0;
iodata.N=iodata.M=iodata.totalt=0;
int Mo=0;
/* use iodata to store the results, also check for consistency of results */
for (int cm=0; cm<iodata.Nms; cm++) {
MPI_Recv(bufint, 4, /* N,M,tilesz,totalt */
MPI_Recv(bufint, 5, /* N,Mo(actual clusters),M(with hybrid),tilesz,totalt */
MPI_INT, cm+1, TAG_MSAUX, MPI_COMM_WORLD, &status);
cout<<"Slave "<<cm+1<<" N="<<bufint[0]<<" M="<<bufint[1]<<" tilesz="<<bufint[2]<<" totaltime="<<bufint[3]<<endl;
cout<<"Slave "<<cm+1<<" N="<<bufint[0]<<" M="<<bufint[1]<<"/"<<bufint[2]<<" tilesz="<<bufint[3]<<" totaltime="<<bufint[4]<<endl;
if (cm==0) { /* update data */
iodata.N=bufint[0];
iodata.M=bufint[1];
iodata.tilesz=bufint[2];
iodata.totalt=bufint[3];
Mo=bufint[1];
iodata.M=bufint[2];
iodata.tilesz=bufint[3];
iodata.totalt=bufint[4];
} else { /* compare against others */
if ((iodata.N != bufint[0]) || (iodata.M != bufint[1]) || (iodata.tilesz != bufint[2])) {
cout<<"Slave "<<cm+1<<" parameters do not match N="<<bufint[0]<<" M="<<bufint[1]<<" tilesz="<<bufint[2]<<endl;
if ((iodata.N != bufint[0]) || (iodata.M != bufint[2]) || (iodata.tilesz != bufint[3])) {
cout<<"Slave "<<cm+1<<" parameters do not match N="<<bufint[0]<<" M="<<bufint[2]<<" tilesz="<<bufint[3]<<endl;
}
if (iodata.totalt<bufint[3]) {
if (iodata.totalt<bufint[4]) {
/* use max value as total time */
iodata.totalt=bufint[3];
iodata.totalt=bufint[4];
}
}
MPI_Recv(bufdouble, 1, /* freq */
MPI_DOUBLE, cm+1, TAG_MSAUX, MPI_COMM_WORLD, &status);
iodata.freqs[cm]=bufdouble[0];
iodata.freq0 +=bufdouble[0];
cout<<"Slave "<<cm+1<<" freq="<<bufdouble[0]<<endl;
cout<<"Slave "<<cm+1<<" frequency (MHz)="<<bufdouble[0]*1e-6<<endl;
}
iodata.freq0/=(double)iodata.Nms;
delete [] bufint;
delete [] bufdouble;
cout<<"Reference freq="<<iodata.freq0<<endl;
cout<<"Reference frequency (MHz)="<<iodata.freq0*1.0e-6<<endl;
/* ADMM memory */
double *Z,*Y,*z;
/* Z: 2Nx2 x Npoly x M */
@ -126,6 +128,24 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
/* file for saving solutions */
FILE *sfp=0;
if (solfile) {
if ((sfp=fopen(solfile,"w+"))==0) {
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
exit(1);
}
}
/* write additional info to solution file */
if (solfile) {
fprintf(sfp,"# solution file (Z) created by SAGECal\n");
fprintf(sfp,"# reference_freq(MHz) polynomial_order stations clusters effective_clusters\n");
fprintf(sfp,"%lf %d %d %d %d\n",iodata.freq0*1e-6,Npoly,iodata.N,Mo,iodata.M);
}
/* interpolation polynomial */
double *B,*Bi;
@ -139,12 +159,38 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
/* regularization factor array */
double *arho;
if ((arho=(double*)calloc((size_t)Nadmm,sizeof(double)))==0) {
/* regularization factor array, size Mx1
one per each hybrid cluster */
double *arho,*arhoslave;
if ((arho=(double*)calloc((size_t)iodata.M,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((arhoslave=(double*)calloc((size_t)Mo,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
/* if text file is given, read it and update rho array */
if (Data::admm_rho_file) {
read_arho_fromfile(Data::admm_rho_file,iodata.M,arho,Mo,arhoslave);
} else {
/* copy common value */
/* setup regularization factor array */
for (int p=0; p<iodata.M; p++) {
arho[p]=admm_rho;
}
for (int p=0; p<Mo; p++) {
arhoslave[p]=admm_rho;
}
}
/* send array to slaves */
/* update rho on each slave */
for(int cm=0; cm<iodata.Nms; cm++) {
MPI_Send(arhoslave, Mo, MPI_DOUBLE, cm+1,TAG_RHO, MPI_COMM_WORLD);
}
free(arhoslave);
#ifdef DEBUG
FILE *dfp;
@ -159,11 +205,6 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
/* find sum B(:,i)B(:,i)^T, and its pseudoinverse */
find_prod_inverse(B,Bi,Npoly,iodata.Nms);
/* setup regularization factor array */
for (int p=0; p<Nadmm; p++) {
arho[p]=admm_rho;
}
#ifdef DEBUG
fprintf(dfp,"B=[\n");
for (int p=0; p<Npoly; p++) {
@ -174,7 +215,7 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
}
fprintf(dfp,"];\n");
fprintf(dfp,"rho=%lf;\narho=[",admm_rho);
for (int p=0; p<Nadmm; p++) {
for (int p=0; p<iodata.M; p++) {
fprintf(dfp,"%lf ",arho[p]);
}
fprintf(dfp,"];\n");
@ -197,8 +238,13 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
#ifdef DEBUG
Ntime=1;
#endif
cout<<"Master total timeslots="<<Ntime<<endl;
cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="<<admm_rho<<endl;
cout<<"Master total timeslots="<<Ntime<<endl;
if (!Data::admm_rho_file) {
cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="<<admm_rho<<endl;
} else {
cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization given by text file "<<Data::admm_rho_file<<endl;
}
int msgcode;
for (int ct=0; ct<Ntime; ct++) {
/* send start processing signal to slaves */
@ -208,11 +254,6 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
}
for (int admm=0; admm<Nadmm; admm++) {
/* update rho on each slave */
for(int cm=0; cm<iodata.Nms; cm++) {
MPI_Send(&arho[admm], 1, MPI_DOUBLE, cm+1,TAG_RHO, MPI_COMM_WORLD);
}
/* get Y_i+rho J_i from each slave */
/* note: for first iteration, reorder values as
2Nx2 complex matrix blocks, M times from each slave
@ -270,8 +311,16 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
my_daxpy(8*iodata.N*iodata.M, &Y[cm*8*iodata.N*iodata.M], B[cm*Npoly+ci], &z[ci*8*iodata.N*iodata.M]);
}
}
/* also scale by 1/rho */
my_dscal(8*iodata.N*iodata.M*Npoly,1.0/arho[admm],z);
/* also scale by 1/rho, only if rho>0, otherwise set it to 0.0*/
for (int cm=0; cm<iodata.M; cm++) {
double invscale=0.0;
if (arho[cm]>0.0) {
invscale=1.0/arho[cm];
}
for (int ci=0; ci<Npoly; ci++) {
my_dscal(8*iodata.N,invscale,&z[8*iodata.N*iodata.M*ci+8*iodata.N*cm]);
}
}
#ifdef DEBUG
fprintf(dfp,"%%%%%%%%%%%%%% time=%d admm=%d\n",ct,admm);
@ -342,6 +391,19 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
resetcount++;
}
}
/* write Z to solution file, same format as J, but we have Npoly times more
values per timeslot per column */
if (solfile) {
for (int p=0; p<iodata.N*8*Npoly; p++) {
fprintf(sfp,"%d ",p);
for (int pp=0; pp<iodata.M; pp++) {
fprintf(sfp," %e",Z[pp*iodata.N*8*Npoly+p]);
}
fprintf(sfp,"\n");
}
}
}
/* send end signal to each slave */
@ -350,6 +412,10 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
MPI_Send(&msgcode, 1, MPI_INT, cm+1,TAG_CTRL, MPI_COMM_WORLD);
}
if (solfile) {
fclose(sfp);
}
#ifdef DEBUG
fclose(dfp);

View File

@ -24,6 +24,7 @@
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <time.h>
#include<sagecal.h>
#include <mpi.h>
@ -71,21 +72,14 @@ sagecal_slave(int argc, char **argv) {
double **pm;
complex double *coh;
FILE *sfp=0;
if (solfile) {
if ((sfp=fopen(solfile,"w+"))==0) {
/* always create default solution file name MS+'.solutions' */
std::string filebuff=std::string(Data::TableName)+std::string(".solutions\0");
if ((sfp=fopen(filebuff.c_str(),"w+"))==0) {
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
return 1;
}
} else {
/* create default solution file name MS+'.solutions' */
std::string filebuff=std::string(Data::TableName)+std::string(".solutions\0");
if ((sfp=fopen(filebuff.c_str(),"w+"))==0) {
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
return 1;
}
/* set solfile to non null value */
solfile=const_cast<char*>(filebuff.c_str());
}
/* set solfile to non null value */
solfile=const_cast<char*>(filebuff.c_str());
double mean_nu;
@ -192,17 +186,25 @@ sagecal_slave(int argc, char **argv) {
msitr[cm]->origin();
}
/* write additional info to solution file */
if (solfile) {
fprintf(sfp,"# solution file created by SAGECal\n");
fprintf(sfp,"# freq(MHz) bandwidth(MHz) time_interval(min) stations clusters effective_clusters\n");
fprintf(sfp,"%lf %lf %lf %d %d %d\n",iodata.freq0*1e-6,iodata.deltaf*1e-6,(double)iodata.tilesz*iodata.deltat/60.0,iodata.N,M,Mt);
}
/**** send info to master ***************************************/
/* send freq (freq0), no. stations (N), total timeslots (totalt), no. of clusters (Mt), integration time (deltat), bandwidth (deltaf) */
int *bufint=new int[4];
/* send freq (freq0), no. stations (N), total timeslots (totalt), no. of clusters (M), true no. of clusters with hybrid (Mt), integration time (deltat), bandwidth (deltaf) */
int *bufint=new int[5];
double *bufdouble=new double[1];
bufint[0]=iodata.N;
bufint[1]=Mt;
bufint[2]=iodata.tilesz;
bufint[3]=iodata.totalt;
bufint[1]=M;
bufint[2]=Mt;
bufint[3]=iodata.tilesz;
bufint[4]=iodata.totalt;
bufdouble[0]=iodata.freq0;
MPI_Send(bufint, 4, MPI_INT, 0,TAG_MSAUX, MPI_COMM_WORLD);
MPI_Send(bufint, 5, MPI_INT, 0,TAG_MSAUX, MPI_COMM_WORLD);
MPI_Send(bufdouble, 1, MPI_DOUBLE, 0,TAG_MSAUX, MPI_COMM_WORLD);
delete [] bufint;
@ -227,6 +229,15 @@ sagecal_slave(int argc, char **argv) {
exit(1);
}
double *arho;
if ((arho=(double*)calloc((size_t)M,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
/* get regularization factor array */
MPI_Recv(arho,M,MPI_DOUBLE,0,TAG_RHO,MPI_COMM_WORLD,&status);
/* if we have more than 1 channel, need to backup raw data */
double *xbackup=0;
if (iodata.Nchan>1) {
@ -236,6 +247,7 @@ sagecal_slave(int argc, char **argv) {
}
}
int msgcode=0;
/* starting iterations doubled */
int start_iter=1;
@ -265,22 +277,19 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
/******************** ADMM *******************************/
for (int admm=0; admm<Nadmm; admm++) {
/* get current regularization factor */
MPI_Recv(&admm_rho,1,MPI_DOUBLE,0,TAG_RHO,MPI_COMM_WORLD,&status);
/* ADMM 1: minimize cost function */
if (admm==0) {
#ifndef HAVE_CUDA
if (start_iter) {
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,0,&mean_nu,&res_0,&res_1); /* 0 for dummy whiten flag */
start_iter=0;
} else {
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,0,&mean_nu,&res_0,&res_1);
}
#endif /* !HAVE_CUDA */
#ifdef HAVE_CUDA
if (start_iter) {
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
start_iter=0;
} else {
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
@ -301,11 +310,11 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
}
#ifndef HAVE_CUDA
sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
#endif /* !HAVE_CUDA */
#ifdef HAVE_CUDA
//sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
sagefit_visibilities_admm_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
//sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
sagefit_visibilities_admm_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
#endif /* HAVE_CUDA */
}
@ -314,10 +323,25 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
if (admm==0) {
/* Y is set to 0 : so original is just rho * J*/
my_dcopy(iodata.N*8*Mt, p, 1, Y, 1);
my_dscal(iodata.N*8*Mt, admm_rho, Y);
/* scale by individual rho for each cluster */
/* if rho<=0, do nothing */
ck=0;
for (ci=0; ci<M; ci++) {
/* Y will be set to 0 if rho<=0 */
my_dscal(iodata.N*8*carr[ci].nchunk, arho[ci], &Y[ck]);
ck+=iodata.N*8*carr[ci].nchunk;
}
} else {
my_daxpy(iodata.N*8*Mt, p, admm_rho, Y);
ck=0;
for (ci=0; ci<M; ci++) {
if (arho[ci]>0.0) {
my_daxpy(iodata.N*8*carr[ci].nchunk, &p[ck], arho[ci], &Y[ck]);
}
ck+=iodata.N*8*carr[ci].nchunk;
//cout<<"Clus="<<ci<<" Chunk="<<carr[ci].nchunk<<" Rho="<<arho[ci]<<endl;
}
}
MPI_Send(Y, iodata.N*8*Mt, MPI_DOUBLE, 0,TAG_YDATA, MPI_COMM_WORLD);
/* for initial ADMM iteration, get back Y with common unitary ambiguity */
if (admm==0) {
@ -329,7 +353,14 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
/* update Y_i <= Y_i + rho (J_i-B_i Z)
since we already have Y_i + rho J_i, only need -rho (B_i Z) */
my_daxpy(iodata.N*8*Mt, Z, -admm_rho, Y);
ck=0;
for (ci=0; ci<M; ci++) {
if (arho[ci]>0.0) {
my_daxpy(iodata.N*8*carr[ci].nchunk, &Z[ck], -arho[ci], &Y[ck]);
}
ck+=iodata.N*8*carr[ci].nchunk;
}
/* calculate primal residual J-BZ */
my_dcopy(iodata.N*8*Mt, p, 1, pres, 1);
my_daxpy(iodata.N*8*Mt, Z, -1.0, pres);
@ -365,21 +396,21 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
/* if residual has increased too much, or all are flagged (0 residual)
or NaN
reset solutions to original
initial values */
if (res_1==0.0 || !isfinite(res_1) || res_1>res_ratio*res_prev) {
initial values : use residual at 1st ADMM */
if (res_01==0.0 || !isfinite(res_01) || res_01>res_ratio*res_prev) {
cout<<"Resetting Solution"<<endl;
/* reset solutions so next iteration has default initial values */
memcpy(p,pinit,(size_t)iodata.N*8*Mt*sizeof(double));
/* also assume iterations have restarted from scratch */
start_iter=1;
/* also forget min residual (otherwise will try to reset it always) */
res_prev=res_1;
} else if (res_1<res_prev) { /* only store the min value */
res_prev=res_1;
res_prev=res_01;
} else if (res_01<res_prev) { /* only store the min value */
res_prev=res_01;
}
end_time = time(0);
elapsed_time = ((double) (end_time-start_time)) / 60.0;
if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS) {
if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
cout<<"nu="<<mean_nu<<endl;
}
cout<<myrank<< ": Timeslot: "<<tilex<<" residual: initial="<<res_00<<"/"<<res_0<<",final="<<res_01<<"/"<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
@ -463,6 +494,7 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
free(Z);
free(Y);
free(pres);
free(arho);
/**********************************************************/
cout<<"Done."<<endl;

View File

@ -1,6 +1,6 @@
OUTPUT=
CXX=g++
CXXFLAGS=-O3 -Wall -g #-fnostack-protector
CXXFLAGS=-O3 -Wall -g #-pg #-fnostack-protector
CASA_LIBDIR=/opt/casacore/lib
CASA_INCDIR=/opt/casacore/include/casacore
CASA_LIBS=-lcasa_casa -lcasa_tables -lcasa_measures -lcasa_ms -lcfitsio

View File

@ -55,6 +55,7 @@ int Data::lbfgs_m=7;
int Data::gpu_threads=128;
int Data::linsolv=1;
int Data::randomize=1;
int Data::whiten=0;
int Data::DoSim=0;
int Data::DoDiag=0;
int Data::doChan=0; /* if 1, solve for each channel in multi channel data */
@ -69,6 +70,7 @@ char *Data::MSlist=NULL;
int Data::Nadmm=1;
int Data::Npoly=2;
double Data::admm_rho=5.0;
char *Data::admm_rho_file=NULL;
/* no upper limit, solve for all timeslots */
int Data::Nmaxtime=0;
@ -298,6 +300,7 @@ Data::loadData(Table ti, Data::IOData iodata) {
for(int k = 0; k < iodata.Nchan; k++) {
Complex *ptr = data[k].data();
bool *flgptr=flag[k].data();
//if (!flag.data()[k]){
if (!flgptr[0] && !flgptr[1] && !flgptr[2] && !flgptr[3]){
cxx+=ptr[0];
cxy+=ptr[1];
@ -438,6 +441,7 @@ Data::loadDataList(vector<MSIter*> msitr, Data::IOData iodata) {
for(int k = 0; k < iodata.NchanMS[cm]; k++) {
Complex *ptr = data[k].data();
bool *flgptr=flag[k].data();
//if (!flag.data()[k]){
if (!flgptr[0] && !flgptr[1] && !flgptr[2] && !flgptr[3]){
cxx+=ptr[0];
cxy+=ptr[1];

View File

@ -114,6 +114,7 @@ namespace Data
extern char *ignorefile;
extern double nulow,nuhigh;
extern int randomize;
extern int whiten;
extern int DoSim; /* if 1, simulation mode */
extern int doChan; /* if 1, solve for each channel in multi channel data */
extern int DoDiag; /* if >0, enables diagnostics (Leverage) 1: write leverage as output (no residual), 2: only calculate fractions of leverage/noise */
@ -122,6 +123,7 @@ namespace Data
extern int Nadmm; /* ADMM iterations >=1 */
extern int Npoly; /* polynomial order >=1 */
extern double admm_rho; /* regularization */
extern char *admm_rho_file; /* text file for regularization of each cluster */
/* for debugging, upper limit on time slots */
extern int Nmaxtime;
}

View File

@ -31,7 +31,7 @@ using namespace Data;
void
print_copyright(void) {
cout<<"SAGECal 0.3.5 (C) 2011-2015 Sarod Yatawatta"<<endl;
cout<<"SAGECal 0.3.8 (C) 2011-2015 Sarod Yatawatta"<<endl;
}
@ -47,15 +47,15 @@ print_help(void) {
cout << "-c cluster.txt: cluster file"<< endl;
cout << "-p solutions.txt: if given, save solution in this file"<< endl;
cout << "-F sky model format: 0: LSM, 1: LSM with 3 order spectra : default "<< Data::format<<endl;
cout << "-I input column (DATA/CORRECTED_DATA) : default " <<Data::DataField<< endl;
cout << "-O ouput column (DATA/CORRECTED_DATA) : default " <<Data::OutField<< endl;
cout << "-I input column (DATA/CORRECTED_DATA/...) : default " <<Data::DataField<< endl;
cout << "-O ouput column (DATA/CORRECTED_DATA/...) : default " <<Data::OutField<< endl;
cout << "-e max EM iterations : default " <<Data::max_emiter<< endl;
cout << "-g max iterations (within single EM) : default " <<Data::max_iter<< endl;
cout << "-l max LBFGS iterations : default " <<Data::max_lbfgs<< endl;
cout << "-m LBFGS memory size : default " <<Data::lbfgs_m<< endl;
cout << "-n no of worker threads : default "<<Data::Nt << endl;
cout << "-t tile size : default " <<Data::TileSize<< endl;
cout << "-a 0,1 : if 1, only simulate (with solutions if solutions file is also given): default " <<Data::DoSim<< endl;
cout << "-a 0,1,2 : if 1, only simulate, if 2, simulate and add to residual, (multiplied by solutions if solutions file is also given): default " <<Data::DoSim<< endl;
cout << "-z ignore_clusters: if only doing a simulation, ignore the cluster ids listed in this file" << endl;
cout << "-b 0,1 : if 1, solve for each channel: default " <<Data::doChan<< endl;
cout << "-x exclude baselines length (lambda) lower than this in calibration : default "<<Data::min_uvcut << endl;
@ -63,9 +63,10 @@ print_help(void) {
cout <<endl<<"Advanced options:"<<endl;
cout << "-k cluster_id : correct residuals with solution of this cluster : default "<<Data::ccid<< endl;
cout << "-o robust rho, robust matrix inversion during correction: default "<<Data::rho<< endl;
cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: default "<<Data::solver_mode<< endl;
cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR, 6:NSD : default "<<Data::solver_mode<< endl;
cout << "-L robust nu, lower bound: default "<<Data::nulow<< endl;
cout << "-H robust nu, upper bound: default "<<Data::nuhigh<< endl;
cout << "-W pre-whiten data: default "<<Data::whiten<< endl;
cout << "-R randomize iterations: default "<<Data::randomize<< endl;
cout << "-D 0,1,2 : if >0, enable diagnostics (Jacobian Leverage) 1 replace Jacobian Leverage as output, 2 only fractional noise/leverage is printed: default " <<Data::DoDiag<< endl;
cout <<"Report bugs to <sarod@users.sf.net>"<<endl;
@ -81,7 +82,7 @@ ParseCmdLine(int ac, char **av) {
print_help();
exit(0);
}
while((c=getopt(ac, av, "a:b:c:d:e:f:g:j:k:l:m:n:o:p:s:t:x:y:z:D:F:I:O:L:H:R:h"))!= -1)
while((c=getopt(ac, av, "a:b:c:d:e:f:g:j:k:l:m:n:o:p:s:t:x:y:z:D:F:I:O:L:H:R:W:h"))!= -1)
{
switch(c)
{
@ -105,7 +106,7 @@ ParseCmdLine(int ac, char **av) {
break;
case 'a':
DoSim= atoi(optarg);
if (DoSim>1) { DoSim=1; }
if (DoSim<0) { DoSim=1; }
break;
case 'b':
doChan= atoi(optarg);
@ -154,6 +155,9 @@ ParseCmdLine(int ac, char **av) {
case 'R':
randomize= atoi(optarg);
break;
case 'W':
whiten= atoi(optarg);
break;
case 'x':
Data::min_uvcut= atof(optarg);
break;
@ -187,13 +191,13 @@ ParseCmdLine(int ac, char **av) {
cout<<"Selecting baselines > "<<min_uvcut<<" and < "<<max_uvcut<<" wavelengths."<<endl;
if (!DoSim) {
cout<<"Using ";
if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS) {
if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS || solver_mode==SM_NSD_RLBFGS) {
cout<<"Gaussian noise model for solver."<<endl;
} else {
cout<<"Robust noise model for solver with degrees of freedom ["<<nulow<<","<<nuhigh<<"]."<<endl;
}
} else {
cout<<"Only doing simulation."<<endl;
cout<<"Only doing simulation (with possible correction for cluster id "<<ccid<<")."<<endl;
}
}
@ -226,11 +230,11 @@ main(int argc, char **argv) {
}
/**********************************************************/
int M,Mt,ci,cj,ck;
/* parameters */
double *p,*pinit,*pfreq;
double **pm;
complex double *coh;
FILE *sfp=0;
/* parameters */
double *p,*pinit,*pfreq;
double **pm;
complex double *coh;
FILE *sfp=0;
if (solfile) {
if (!Data::DoSim) {
if ((sfp=fopen(solfile,"w+"))==0) {
@ -240,9 +244,16 @@ main(int argc, char **argv) {
} else {
/* simulation mode, read only access */
if ((sfp=fopen(solfile,"r"))==0) {
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
fprintf(stderr,"%s: %d: no solution file present\n",__FILE__,__LINE__);
return 1;
}
/* remember to skip first 3 lines from solution file */
char chr;
for (ci=0; ci<3; ci++) {
do {
chr = fgetc(sfp);
} while (chr != '\n');
}
}
}
@ -390,7 +401,15 @@ main(int argc, char **argv) {
for(int cm=0; cm<iodata.Nms; cm++) {
msitr[cm]->origin();
}
/* starting iterations doubled */
/* write additional info to solution file */
if (solfile && !Data::DoSim) {
fprintf(sfp,"# solution file created by SAGECal\n");
fprintf(sfp,"# freq(MHz) bandwidth(MHz) time_interval(min) stations clusters effective_clusters\n");
fprintf(sfp,"%lf %lf %lf %d %d %d\n",iodata.freq0*1e-6,iodata.deltaf*1e-6,(double)iodata.tilesz*iodata.deltat/60.0,iodata.N,M,Mt);
}
/* starting iterations are doubled */
int start_iter=1;
while (msitr[0]->more()) {
start_time = time(0);
@ -456,7 +475,8 @@ main(int argc, char **argv) {
inout(p: length(8*mic_N*Mt))
sagefit_visibilities_mic(mic_u,mic_v,mic_w,mic_x,mic_N,mic_Nbase,mic_tilesz,barr,mic_chunks,mic_pindex,coh,M,Mt,mic_freq0,mic_deltaf,p,mic_data_min_uvcut,mic_data_Nt,2*mic_data_max_emiter,mic_data_max_iter,(mic_data_dochan? 0 :mic_data_max_lbfgs),mic_data_lbfgs_m,mic_data_gpu_threads,mic_data_linsolv,mic_data_solver_mode,mic_data_nulow,mic_data_nuhigh,mic_data_randomize,&mean_nu,&res_0,&res_1);
#else /* NOT MIC */
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
//sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
#endif /* USE_MIC */
start_iter=0;
} else {
@ -474,14 +494,14 @@ main(int argc, char **argv) {
inout(p: length(8*mic_N*Mt))
sagefit_visibilities_mic(mic_u,mic_v,mic_w,mic_x,mic_N,mic_Nbase,mic_tilesz,barr,mic_chunks,mic_pindex,coh,M,Mt,mic_freq0,mic_deltaf,p,mic_data_min_uvcut,mic_data_Nt,mic_data_max_emiter,mic_data_max_iter,(mic_data_dochan? 0: mic_data_max_lbfgs),mic_data_lbfgs_m,mic_data_gpu_threads,mic_data_linsolv,mic_data_solver_mode,mic_data_nulow,mic_data_nuhigh,mic_data_randomize,&mean_nu,&res_0,&res_1);
#else /* NOT MIC */
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
#endif /* USE_MIC */
}
#endif /* !HAVE_CUDA */
#ifdef HAVE_CUDA
#ifdef ONE_GPU
if (start_iter) {
sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt, (iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
start_iter=0;
} else {
sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
@ -489,7 +509,7 @@ main(int argc, char **argv) {
#endif /* ONE_GPU */
#ifndef ONE_GPU
if (start_iter) {
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
///DBG sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
start_iter=0;
} else {
@ -561,11 +581,11 @@ main(int argc, char **argv) {
} else {
/************ simulation only mode ***************************/
if (!solfile) {
predict_visibilities_multifreq(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt);
predict_visibilities_multifreq(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,(Data::DoSim>1?1:0));
} else {
read_solutions(sfp,p,carr,iodata.N,M);
/* if solution file is given, read in the solutions and predict */
predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt);
predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,(Data::DoSim>1?1:0),Data::ccid,Data::rho);
}
/************ end simulation only mode ***************************/
}
@ -613,10 +633,14 @@ main(int argc, char **argv) {
}
end_time = time(0);
elapsed_time = ((double) (end_time-start_time)) / 60.0;
if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS) {
if (!Data::DoSim) {
if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
cout<<"nu="<<mean_nu<<endl;
}
cout<<"Timeslot: "<<tilex<<" Residual: initial="<<res_0<<",final="<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
cout<<"Timeslot: "<<tilex<<" Residual: initial="<<res_0<<",final="<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
} else {
cout<<"Timeslot: "<<tilex<<", Time spent="<<elapsed_time<<" minutes"<<endl;
}
}
Data::freeData(iodata);

View File

@ -1,6 +1,6 @@
CC=gcc
CXX=g++
CFLAGS= -Wall -O3 -g
CFLAGS= -Wall -O3 -g #-pg
CLIBS= -lm -lpthread
#LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
#LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
@ -52,7 +52,6 @@ rtr_solve_robust_admm.o:rtr_solve_robust_admm.c
admm_solve.o:admm_solve.c
$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
RANLIB=ranlib
libsagecal.a:$(OBJECTS) sagecal.h
ar rv $@ $(OBJECTS); \

View File

@ -387,7 +387,7 @@ minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {
int
sagefit_visibilities_admm(double *u, double *v, double *w, double *x, int N,
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1) {
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1) {
int ci,cj,ck,tcj;
double *p; // parameters: m x 1
@ -511,7 +511,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
/* use a reasonable TR radius because cost function has extra
regularization NB: ADMM very sensitive to this */
double Delta0=2.0;
rtr_solve_nocuda_robust_admm(&p[carr[cj].p[ck]], &Y[carr[cj].p[ck]], &BZ[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+5, this_itermax+10, Delta0, Delta0*0.125, admm_rho, nulow, nuhigh, info, &lmdata);
rtr_solve_nocuda_robust_admm(&p[carr[cj].p[ck]], &Y[carr[cj].p[ck]], &BZ[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+5, this_itermax+10, Delta0, Delta0*0.125, admm_rho[cj], nulow, nuhigh, info, &lmdata);
if (ci==max_emiter-1){
robust_nuM[cj]+=lmdata.robust_nu;
}
@ -563,6 +563,8 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
free(robust_nuM);
if (robust_nu0<nulow) {
robust_nu0=nulow;
} else if (robust_nu0>nuhigh) {
robust_nu0=nuhigh;
}
/* final residual calculation */
@ -601,7 +603,7 @@ pipeline_slave_code_admm_flt(void *data)
sync_barrier(&(td->pline->gate2)); /* stop at gate 2 */
/* do work : only one solver */
//printf("state=%d, thread %d\n",gd->status[tid],tid);
if (gd->status[tid]==PT_DO_WORK_RRTR ) {
if (gd->status[tid]==PT_DO_WORK_RRTR || gd->status[tid]==PT_DO_WORK_NSD) {
/************************* work *********************/
me_data_t *t=(me_data_t *)gd->lmdata[tid];
/* divide the tiles into chunks tilesz/nchunk */
@ -626,10 +628,13 @@ pipeline_slave_code_admm_flt(void *data)
ntiles=t->tilesz-cj;
}
/* max trust region radius: keep reasonable */
float Delta0=2.0f;
/* storage (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M */
ret=rtr_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->admm_rho, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
if (gd->status[tid]==PT_DO_WORK_NSD) {
ret=nsd_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+15, gd->admm_rho[tid], gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
} else {
/* max trust region radius: keep reasonable */
float Delta0=2.0f;
ret=rtr_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->admm_rho[tid], gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
}
init_res+=gd->info[tid][0];
final_res+=gd->info[tid][1];
@ -712,7 +717,7 @@ destroy_pipeline_admm_flt(th_pipeline *pline)
//#define DEBUG
int
sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x, int N,
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize,double admm_rho, double *mean_nu, double *res_0, double *res_1) {
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize,double *admm_rho, double *mean_nu, double *res_0, double *res_1) {
int ci,cj;
@ -857,23 +862,20 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
/* also calculate the total storage needed to be allocated on a GPU */
/* determine total size for memory allocation */
int Mm=8*N;
int64_t data_sz=0;
/* size for RTR (float) : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
where M: no of stations (params/8) ntiles: tile size Nbase: baselines
/* size for RTR/NSD (float), 128 is the ThreadsPerBlock
NSD is a bit lower
*/
int64_t data_sz_rtr=0;
/* 2 x Nbase more than normal RTR for weight, log(weight) */
data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
/* size for ROBUSTLM */
data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
data_sz=MAX(data_sz,data_sz_rtr);
data_sz+=(int64_t)Mm*sizeof(float);
if (solver_mode==SM_NSD_RLBFGS) {
data_sz=(8*N*(7+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
} else { /* default is RTR */
data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
}
tpg.data_size=data_sz;
tpg.nulow=nulow;
tpg.nuhigh=nuhigh;
tpg.randomize=randomize;
tpg.admm_rho=(float)admm_rho;
sync_barrier(&(tp.gate2)); /* sync at gate 2*/
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
@ -948,6 +950,7 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
tpg.p[0]=&pf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
tpg.Y[0]=&Yf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
tpg.Z[0]=&Zf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
tpg.admm_rho[0]=(float)admm_rho[c0];
tpg.x[0]=xdummy0f;
tpg.M[0]=8*N; /* even though size of p is > M, dont change this */
tpg.N[0]=n; /* Nbase*tilesz*8 */
@ -960,6 +963,7 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
tpg.p[1]=&pf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
tpg.Y[1]=&Yf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
tpg.Z[1]=&Zf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
tpg.admm_rho[1]=(float)admm_rho[c1];
tpg.x[1]=xdummy1f;
tpg.M[1]=8*N; /* even though size of p is > M, dont change this */
tpg.N[1]=n; /* Nbase*tilesz*8 */
@ -971,7 +975,11 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
/**************************************************************************/
/* both threads do work */
tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
if (solver_mode==SM_NSD_RLBFGS) {
tpg.status[0]=tpg.status[1]=PT_DO_WORK_NSD;
} else {
tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
}
sync_barrier(&(tp.gate2)); /* sync at gate 2 */
sync_barrier(&(tp.gate1)); /* sync at gate 1 */
tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
@ -1037,6 +1045,7 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
tpg.p[0]=&pf[carr[c0].p[0]];
tpg.Y[0]=&Yf[carr[c0].p[0]];
tpg.Z[0]=&Zf[carr[c0].p[0]];
tpg.admm_rho[0]=(float)admm_rho[c0];
tpg.x[0]=xdummy0f;
tpg.M[0]=8*N;
tpg.N[0]=n;
@ -1046,7 +1055,11 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
tpg.linsolv=linsolv;
tpg.lmdata[0]=&lmdata0;
tpg.status[0]=PT_DO_WORK_RRTR;
if (solver_mode==SM_NSD_RLBFGS) {
tpg.status[0]=PT_DO_WORK_NSD;
} else {
tpg.status[0]=PT_DO_WORK_RRTR;
}
tpg.status[1]=PT_DO_NOTHING;
sync_barrier(&(tp.gate2)); /* sync at gate 2 */
@ -1106,9 +1119,12 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
printf("mean nu=%lf\n",robust_nu0);
#endif
free(robust_nuM);
if (robust_nu0<nulow) {
if (robust_nu0<nulow) {
robust_nu0=nulow;
}
} else if (robust_nu0>nuhigh) {
robust_nu0=nuhigh;
}
/******** free threads ***************/
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
@ -1139,7 +1155,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
int
sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, double *x, int N,
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize,double admm_rho, double *mean_nu, double *res_0, double *res_1) {
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize,double *admm_rho, double *mean_nu, double *res_0, double *res_1) {
int ci,cj;
@ -1284,24 +1300,16 @@ sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, doubl
tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
/* also calculate the total storage needed to be allocated on a GPU */
/* determine total size for memory allocation */
int Mm=8*N;
int64_t data_sz=0;
/* size for RTR (float) : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
where M: no of stations (params/8) ntiles: tile size Nbase: baselines
/* size for RTR/NSD (float), 128 is the ThreadsPerBlock
NSD is a bit lower, but use the same
*/
int64_t data_sz_rtr=0;
/* 2 x Nbase more than normal RTR for weight, log(weight) */
data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
/* size for ROBUSTLM */
data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
data_sz=MAX(data_sz,data_sz_rtr);
data_sz+=(int64_t)Mm*sizeof(float);
data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
tpg.data_size=data_sz;
tpg.nulow=nulow;
tpg.nuhigh=nuhigh;
tpg.randomize=randomize;
tpg.admm_rho=(float)admm_rho;
sync_barrier(&(tp.gate2)); /* sync at gate 2*/
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
@ -1358,6 +1366,7 @@ sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, doubl
tpg.p[0]=&pf[carr[c0].p[0]];
tpg.Y[0]=&Yf[carr[c0].p[0]];
tpg.Z[0]=&Zf[carr[c0].p[0]];
tpg.admm_rho[0]=(float)admm_rho[c0];
tpg.x[0]=xdummy0f;
tpg.M[0]=8*N;
tpg.N[0]=n;
@ -1427,9 +1436,13 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
printf("mean nu=%lf\n",robust_nu0);
#endif
free(robust_nuM);
if (robust_nu0<nulow) {
if (robust_nu0<nulow) {
robust_nu0=nulow;
}
} else if (robust_nu0>nuhigh) {
robust_nu0=nuhigh;
}
/******** free threads ***************/
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
tpg.status[0]=tpg.status[1]=PT_DO_DGPU;

View File

@ -23,6 +23,7 @@
#include <string.h>
#include <math.h>
#include <float.h>
#include <unistd.h>
#include "sagecal.h"
#include <cuda_runtime.h>
@ -1175,8 +1176,14 @@ attach_gpu_to_thread2(int card, cublasHandle_t *cbhandle,float **WORK, int64_t
cudaError_t err;
culaStatus status;
cublasStatus_t cbstatus;
status=culaSelectDevice(card); /* FIXME: Do not enable CULA if its not going to be used */
if (usecula) {
status=culaSelectDevice(card); /* Do not enable CULA if its not going to be used */
/* if first try failed, wait and retry */
if (status) {
fprintf(stderr,"%s: %d: CULA device select failure, retrying\n",__FILE__,__LINE__);
sleep(10);
status=culaSelectDevice(card);
}
checkStatus(status,__FILE__,__LINE__);
status=culaInitialize();
checkStatus(status,__FILE__,__LINE__);
@ -1189,13 +1196,21 @@ attach_gpu_to_thread2(int card, cublasHandle_t *cbhandle,float **WORK, int64_t
exit(1);
}
checkStatus(status,__FILE__,__LINE__);
cudaSetDevice(card); /* we need this */
} else { /* not using CULA */
cudaSetDevice(card);
}
cudaSetDevice(card); /* do we need this because sometimes we do not use cula */
cbstatus=cublasCreate(cbhandle);
if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,"%s: %d: CUBLAS create fail\n",__FILE__,__LINE__);
exit(1);
/* retry once more before exiting */
fprintf(stderr,"%s: %d: CUBLAS create failure, retrying\n",__FILE__,__LINE__);
sleep(10);
cbstatus=cublasCreate(cbhandle);
if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
fprintf(stderr,"%s: %d: CUBLAS create fail\n",__FILE__,__LINE__);
exit(1);
}
}
err=cudaMalloc((void**)WORK, (size_t)work_size);
@ -1246,7 +1261,6 @@ detach_gpu_from_thread2(int card,cublasHandle_t cbhandle,float *WORK, int usecul
}
cudaFree(WORK);
//cudaSetDevice(card);
cudaDeviceReset();
}
void

View File

@ -4,6 +4,8 @@ export LD_LIBRARY_PATH
source /opt/intel/composerxe/bin/compilervars.sh intel64
export MIC_ENV_PREFIX=MIC
##export MKL_MIC_ENABLE=1
##export OMP_NUM_THREADS=16
export MIC_PREFIX=MIC
export MIC_OMP_NUM_THREADS=240
#

View File

@ -1975,6 +1975,8 @@ sagefit_visibilities_dual_pt(double *u, double *v, double *w, double *x, int N,
free(robust_nuM);
if (robust_nu0<nulow) {
robust_nu0=nulow;
} else if (robust_nu0>nuhigh) {
robust_nu0=nuhigh;
}
}
@ -2363,7 +2365,10 @@ sagefit_visibilities_dual_pt_one_gpu(double *u, double *v, double *w, double *x,
free(robust_nuM);
if (robust_nu0<nulow) {
robust_nu0=nulow;
} else if (robust_nu0>nuhigh) {
robust_nu0=nuhigh;
}
}
/******** free threads ***************/
@ -2503,7 +2508,7 @@ pipeline_slave_code_flt(void *data)
//printf("state=%d, thread %d\n",gd->status[tid],tid);
if (gd->status[tid]==PT_DO_WORK_LM || gd->status[tid]==PT_DO_WORK_OSLM
|| gd->status[tid]==PT_DO_WORK_RLM || gd->status[tid]==PT_DO_WORK_OSRLM
|| gd->status[tid]==PT_DO_WORK_RTR || gd->status[tid]==PT_DO_WORK_RRTR ) {
|| gd->status[tid]==PT_DO_WORK_RTR || gd->status[tid]==PT_DO_WORK_RRTR || gd->status[tid]==PT_DO_WORK_NSD) {
/************************* work *********************/
me_data_t *t=(me_data_t *)gd->lmdata[tid];
/* divide the tiles into chunks tilesz/nchunk */
@ -2535,21 +2540,18 @@ pipeline_slave_code_flt(void *data)
} else if (gd->status[tid]==PT_DO_WORK_RLM) {
ret=rlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,(void*)gd->lmdata[tid]);
} else if (gd->status[tid]==PT_DO_WORK_OSRLM) {
ret=osrlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,gd->randomize,(void*)gd->lmdata[tid]);
ret=osrlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,gd->randomize,0,(void*)gd->lmdata[tid]); /* FIXME 0 for whiten */
} else if (gd->status[tid]==PT_DO_WORK_RTR) {
/* note stations: M/8, baselines ntiles*Nbase RSD+RTR */
float Delta0=0.01f; /* use very small value because previous LM has already made the solution close to true value */
/* storage:
need (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N
where N<=M/8 M<=ntiles*Nbase, Nbase<=ntiles*Nbase,
BlocksPerGrid=ceil(ntiles*Nbase/128)
so(max): M*( 13 + ntiles*Nbase/128 +1 +1) + 18*ntiles*Nbase
where M: no of stations (params/8), ntiles: tile size, Nbase: baselines */
/* storage: see function header
*/
ret=rtr_solve_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
} else if (gd->status[tid]==PT_DO_WORK_RRTR) {
float Delta0=0.01f;
/* storage (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M */
ret=rtr_solve_cuda_robust_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
} else if (gd->status[tid]==PT_DO_WORK_NSD) {
ret=nsd_solve_cuda_robust_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+15, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
}
init_res+=gd->info[tid][0];
final_res+=gd->info[tid][1];
@ -2654,7 +2656,6 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
int *cr=0; /* array for random permutation of clusters */
int c0,c1;
//opts[0]=LM_INIT_MU; opts[1]=1E-15; opts[2]=1E-15; opts[3]=1E-20;
opts[0]=CLM_INIT_MU; opts[1]=1E-9; opts[2]=1E-9; opts[3]=1E-9;
opts[4]=-CLM_DIFF_DELTA;
@ -2747,7 +2748,7 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
if ((robust_nuM=(double*)calloc((size_t)(M),sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
@ -2775,37 +2776,37 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
int64_t data_sz=0;
/* size for MLM (disabled) */
//data_sz=(int64_t)(n+Mm*n+n+n+n+Mm+Mm+Mm*Mm+Mm*Mm+Mm+Mm+Mm+Mm+Mm+Nbase1*8)*sizeof(double)+(int64_t)Nbase1*2*sizeof(char);
/* size for RTR (float) : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
where M: no of stations (params/8) ntiles: tile size Nbase: baselines
/* size for RTR/NSD (float), 128 is the ThreadsPerBlock
*/
int64_t data_sz_rtr=0;
if (solver_mode==SM_RTR_OSLM_LBFGS) {
data_sz_rtr=(N*(13+Nbase1/128+1+1)+18*Nbase1)*sizeof(float);
/* use same size as robust version, probably is lower */
data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
} else if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
/* 2 x Nbase more than normal RTR for weight, log(weight) */
data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
}
if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS) {
data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
} else if (solver_mode==SM_NSD_RLBFGS) {
data_sz=(8*N*(7+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
} else if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS) {
/* size for LM */
data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
if(solver_mode==SM_RTR_OSLM_LBFGS) {
data_sz=MAX(data_sz,data_sz_rtr);
if (linsolv==1) {
data_sz+=(int64_t)Mm*sizeof(float);
} else if (linsolv==2) {
data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
}
} else if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
} else if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS) {
/* size for ROBUSTLM */
data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
if(solver_mode==SM_RTR_OSRLM_RLBFGS) {
data_sz=MAX(data_sz,data_sz_rtr);
if (linsolv==1) {
data_sz+=(int64_t)Mm*sizeof(float);
} else if (linsolv==2) {
data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
}
} else {
fprintf(stderr,"%s: %d: invalid mode for solver\n",__FILE__,__LINE__);
exit(1);
}
if (linsolv==1) {
data_sz+=(int64_t)Mm*sizeof(float);
} else if (linsolv==2) {
data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
}
tpg.data_size=data_sz;
tpg.nulow=nulow;
tpg.nuhigh=nuhigh;
@ -2938,6 +2939,11 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
lmdata0.robust_nu=lmdata1.robust_nu=robust_nu0; /* initial robust nu */
}
tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
} else if (solver_mode==SM_NSD_RLBFGS) {
if (!ci) {
lmdata0.robust_nu=lmdata1.robust_nu=robust_nu0; /* initial robust nu */
}
tpg.status[0]=tpg.status[1]=PT_DO_WORK_NSD;
} else {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: undefined solver mode\n",__FILE__,__LINE__);
@ -2965,7 +2971,7 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
nerr[c1]=0.0;
}
/* update robust_nu */
if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
robust_nuM[c0]+=lmdata0.robust_nu;
robust_nuM[c1]+=lmdata1.robust_nu;
}
@ -3047,11 +3053,15 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
} else if (solver_mode==SM_RTR_OSLM_LBFGS) {
tpg.status[0]=PT_DO_WORK_RTR;
} else if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
/* last iteration is OSRLM */
if (!ci) {
lmdata0.robust_nu=robust_nu0; /* initial robust nu */
}
tpg.status[0]=PT_DO_WORK_RRTR;
} else if (solver_mode==SM_NSD_RLBFGS) {
if (!ci) {
lmdata0.robust_nu=robust_nu0; /* initial robust nu */
}
tpg.status[0]=PT_DO_WORK_NSD;
} else {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: undefined solver mode\n",__FILE__,__LINE__);
@ -3077,7 +3087,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
nerr[c0]=0.0;
}
/* update robust_nu */
if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
robust_nuM[c0]+=lmdata0.robust_nu;
}
/* once again subtract solved model from data */
@ -3108,7 +3118,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
free(xdummy1f);
free(pf);
free(ddcohf);
if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
/* calculate mean robust_nu over all clusters */
robust_nu0=my_dasum(M,robust_nuM)/(double)M;
#ifdef DEBUG
@ -3120,7 +3130,10 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
free(robust_nuM);
if (robust_nu0<nulow) {
robust_nu0=nulow;
} else if (robust_nu0>nuhigh) {
robust_nu0=nuhigh;
}
}
/******** free threads ***************/
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
@ -3133,11 +3146,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
if (max_lbfgs>0) {
/* use LBFGS */
if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
/* if RTR, divide by 8 */
if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
robust_nu0 *=0.125;
}
if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
lmdata0.robust_nu=robust_nu0;
ret=lbfgs_fit_robust_cuda(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
} else {

View File

@ -646,7 +646,6 @@ predict_threadfn_withgain_full(void *data) {
*/
px=(ci+t->boff)/((Ntilebase+t->carr[cm].nchunk-1)/t->carr[cm].nchunk);
//pm=&(t->p[cm*8*N]);
pm=&(t->p[t->carr[cm].p[px]]);
G1[0]=(pm[sta1*8])+_Complex_I*(pm[sta1*8+1]);
G1[1]=(pm[sta1*8+2])+_Complex_I*(pm[sta1*8+3]);
@ -710,7 +709,6 @@ minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {
int Nbase1=(dp->Nbase)*(dp->tilesz);
/* calculate min baselines a thread can handle */
//Nthb0=ceil((double)Nbase1/(double)Nt);
Nthb0=(Nbase1+Nt-1)/Nt;
/* setup threads */
@ -800,7 +798,6 @@ minimize_viz_full_pth00(double *p, double *x, int m, int n, void *data) {
int Ntilebase=(dp->Nbase)*(dp->tilesz);
int px;
#pragma omp parallel for
for (ci=0; ci<Ntilebase; ci++) {
/* iterate over the sky model and calculate contribution */
/* for this x[8*ci:8*(ci+1)-1] */
@ -873,7 +870,7 @@ minimize_viz_full_pth00(double *p, double *x, int m, int n, void *data) {
int
sagefit_visibilities(double *u, double *v, double *w, double *x, int N,
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *mean_nu, double *res_0, double *res_1) {
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, int whiten, double *mean_nu, double *res_0, double *res_1) {
/* u,v,w : size Nbase*tilesz x 1 x: size Nbase*8*tilesz x 1 */
/* barr: size Nbase*tilesz x 1 carr: size Mx1 */
/* pp: size 8*N*M x 1 */
@ -943,7 +940,7 @@ sagefit_visibilities(double *u, double *v, double *w, double *x, int N,
#endif
exit(1);
}
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
if ((robust_nuM=(double*)calloc((size_t)(M),sizeof(double)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@ -1012,7 +1009,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
/* only the last EM iteration is robust */
if (ci==max_emiter-1){
lmdata.robust_nu=robust_nu0;
ret=rlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, (void*)&lmdata);
ret=rlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, whiten, (void*)&lmdata);
/* get updated value of robust_nu */
robust_nuM[cj]+=lmdata.robust_nu;
} else {
@ -1022,7 +1019,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
/* only the last EM iteration is robust */
if (ci==max_emiter-1){
lmdata.robust_nu=robust_nu0;
ret=osrlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, randomize, (void*)&lmdata);
ret=osrlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, randomize, whiten, (void*)&lmdata);
/* get updated value of robust_nu */
robust_nuM[cj]+=lmdata.robust_nu;
} else {
@ -1042,6 +1039,15 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
if (ci==max_emiter-1){
robust_nuM[cj]+=lmdata.robust_nu;
}
} else if (solver_mode==SM_NSD_RLBFGS) { /* Nesterov's */
/* NSD */
if (!ci){
lmdata.robust_nu=robust_nu0;
}
ret=nsd_solve_nocuda_robust(&p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+15, nulow, nuhigh, info, &lmdata);
if (ci==max_emiter-1){
robust_nuM[cj]+=lmdata.robust_nu;
}
} else { /* not used */
//ret=mlm_der_single(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, (void*)&lmdata);
#ifndef USE_MIC
@ -1069,7 +1075,7 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
mylm_fit_single_pth(p, xsub, 8*N, n, (void*)&lmdata);
my_daxpy(n, xsub, -1.0, xdummy);
/* if robust LM, calculate average nu over hybrid clusters */
if ((solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
if ((solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
robust_nuM[cj]/=(double)carr[cj].nchunk;
}
}
@ -1088,7 +1094,7 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
}
free(nerr);
free(xdummy);
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
/* calculate mean robust_nu over all clusters */
robust_nu0=my_dasum(M,robust_nuM)/(double)M;
#ifdef DEBUG
@ -1100,6 +1106,8 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
free(robust_nuM);
if (robust_nu0<nulow) {
robust_nu0=nulow;
} else if (robust_nu0>nuhigh) {
robust_nu0=nuhigh;
}
}
@ -1108,13 +1116,10 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
lmdata.Nt=32; /* FIXME increase threads for MIC */
#endif
/* use LBFGS */
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
/* if RTR, divide by 8 */
if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
robust_nu0 *=0.125;
}
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
lmdata.robust_nu=robust_nu0;
ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
/* pre-whiten data when calculating residual */
ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, whiten, (void*)&lmdata);
} else {
ret=lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
}
@ -1273,7 +1278,7 @@ bfgsfit_visibilities(double *u, double *v, double *w, double *x, int N,
/* use LBFGS */
if (solver_mode==2 || solver_mode==3) {
lmdata.robust_nu=mean_nu;
ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, 0, (void*)&lmdata); /* 0 to disable whitening */
} else {
ret=lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
}

View File

@ -1352,7 +1352,7 @@ kernel_fns_fupdate_weights(int N, int Nbase, cuFloatComplex *x, float *y, float
1) its not flagged (sta1,sta2)>=0
*/
float sumn=0.0f;
float temp1,temp2,tt,yy,c=0.0f;
float temp1,temp2,tt;
if (sta1>=0 && sta2>=0) {
cuFloatComplex G1[4];
cuFloatComplex G2[4];
@ -1382,25 +1382,28 @@ kernel_fns_fupdate_weights(int N, int Nbase, cuFloatComplex *x, float *y, float
/* T=T*G2' */
ambt(T1,G2,T2);
/* error using Kahan summation */
/* use p=2, find MAX value of residual error out of XX,XY,YX,YY
instead of the sum */
/* V->U */
temp1=y[8*n]-T2[0].x;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp1=y[8*n+1]-T2[0].y;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp2=y[8*n+1]-T2[0].y;
sumn=temp1*temp1+temp2*temp2;
temp1=y[8*n+2]-T2[1].x;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp1=y[8*n+3]-T2[1].y;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp2=y[8*n+3]-T2[1].y;
tt=temp1*temp1+temp2*temp2;
if (sumn<tt) { sumn=tt; }
temp1=y[8*n+4]-T2[2].x;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp1=y[8*n+5]-T2[2].y;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp2=y[8*n+5]-T2[2].y;
tt=temp1*temp1+temp2*temp2;
if (sumn<tt) { sumn=tt; }
temp1=y[8*n+6]-T2[3].x;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp1=y[8*n+7]-T2[3].y;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */
temp2=y[8*n+7]-T2[3].y;
tt=temp1*temp1+temp2*temp2;
if (sumn<tt) { sumn=tt; }
//wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */
wtd[n]=(nu0+2.0f)/(nu0+sumn); /* 2 variate T distribution */
}
}
@ -1422,7 +1425,7 @@ kernel_fns_fupdate_weights_q(int N, int Nbase, cuFloatComplex *x, float *y, floa
1) its not flagged (sta1,sta2)>=0
*/
float sumn=0.0f;
float temp1,temp2,tt,yy,c=0.0f;
float temp1,temp2,tt;
if (sta1>=0 && sta2>=0) {
cuFloatComplex G1[4];
cuFloatComplex G2[4];
@ -1452,25 +1455,28 @@ kernel_fns_fupdate_weights_q(int N, int Nbase, cuFloatComplex *x, float *y, floa
/* T=T*G2' */
ambt(T1,G2,T2);
/* error using Kahan summation */
/* use p=2, find MAX value of residual error out of XX,XY,YX,YY
instead of the sum */
/* V->U */
temp1=y[8*n]-T2[0].x;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp1=y[8*n+1]-T2[0].y;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp2=y[8*n+1]-T2[0].y;
sumn=temp1*temp1+temp2*temp2;
temp1=y[8*n+2]-T2[1].x;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp1=y[8*n+3]-T2[1].y;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp2=y[8*n+3]-T2[1].y;
tt=temp1*temp1+temp2*temp2;
if (sumn<tt) { sumn=tt; }
temp1=y[8*n+4]-T2[2].x;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp1=y[8*n+5]-T2[2].y;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp2=y[8*n+5]-T2[2].y;
tt=temp1*temp1+temp2*temp2;
if (sumn<tt) { sumn=tt; }
temp1=y[8*n+6]-T2[3].x;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
temp1=y[8*n+7]-T2[3].y;
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */
temp2=y[8*n+7]-T2[3].y;
tt=temp1*temp1+temp2*temp2;
if (sumn<tt) { sumn=tt; }
//wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */
wtd[n]=(nu0+2.0f)/(nu0+sumn); /* 2 variate T distribution */
qd[n]=wtd[n]-logf(wtd[n]);
}
}
@ -1667,7 +1673,7 @@ cudakernel_fns_f(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatCo
return ed: error vector, BlocksPerGridx1
*/
/* need BlocksPerGrid+1+L float storage */
/* need BlocksPerGrid+4+L float storage <= (2 BlocksPerGrid + 4) */
float
cudakernel_fns_f_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, float *y, float *coh, char *bbh, float *wtd, float *gWORK) {
#ifdef CUDA_DBG

View File

@ -87,7 +87,7 @@ calculate_uv_mode_vectors_scalar(double u, double v, double beta, int n0, double
shpvl[zci][xci]=H_e(xval,xci)*expval/sqrt((double)(2<<xci)*fact[xci]);
}
zci=1;
xval=v*beta;
xval=v*beta;
expval=exp(-0.5*xval*xval);
for (xci=0; xci<n0; xci++) {
shpvl[zci][xci]=H_e(xval,xci)*expval/sqrt((double)(2<<xci)*fact[xci]);

View File

@ -209,10 +209,10 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
/* each element of list is a list of source names */
if ((cfp=fopen(clusterfile,"r"))==0) {
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
return 1;
exit(1);
}
/* allocate memory for buffer */
buff_len = 2048; /* FIXME: handle long names */
buff_len = MAX_SNAME; /* handle long names */
if((buff = (char*)malloc(sizeof(char)*(size_t)(buff_len)))==NULL) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
@ -231,7 +231,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
/* new cluster found */
if ((clus= (clust_t*)malloc(sizeof(clust_t)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
sscanf(buff,"%d",&clus->id);
clus->slist=NULL;
@ -248,11 +248,11 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
/* source found for this cluster */
if ((sclus= (clust_n*)malloc(sizeof(clust_n)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if ((sclus->name=(char*)malloc((size_t)(strlen(buff)+1)*sizeof(char)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
strcpy(sclus->name,buff);
clus->slist=g_list_prepend(clus->slist,sclus);
@ -277,10 +277,10 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
*/
if ((cfp=fopen(skymodel,"r"))==0) {
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if ((buff = (char*)realloc((void*)(buff),sizeof(char)*(size_t)(128)))==NULL) {
if ((buff = (char*)realloc((void*)(buff),sizeof(char)*(size_t)(MAX_SNAME)))==NULL) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
@ -298,12 +298,12 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
if (c!=EOF && c>0) {
if ((hkey=(char*)malloc((size_t)(strlen(buff)+1)*sizeof(char)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
strcpy(hkey,buff);
if ((source=(sinfo_t*)malloc(sizeof(sinfo_t)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
/* calculate l,m */
/* Rad=(hr+min/60+sec/60*60)*pi/12 */
@ -328,11 +328,11 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
fratio=log(freq0/f0);
fratio1=fratio*fratio;
fratio2=fratio1*fratio;
/* catch -ve sI */
/* catch -ve and 0 sI */
if (sI>0.0) {
source->sI=exp(log(sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2);
} else {
source->sI=-exp(log(-sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2);
source->sI=(sI==0.0?0.0:-exp(log(-sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2));
}
} else {
source->sI=sI;
@ -365,7 +365,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
source->stype=STYPE_GAUSSIAN;
if((exg=(exinfo_gaussian *)malloc(sizeof(exinfo_gaussian)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
exg->eX=2.0*eX; /* scale by 2 */
exg->eY=2.0*eY;
@ -387,7 +387,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
source->stype=STYPE_DISK;
if((exd=(exinfo_disk*)malloc(sizeof(exinfo_disk)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
exd->eX=eX;
/* negate angles */
@ -407,7 +407,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
source->stype=STYPE_RING;
if((exr=(exinfo_ring*)malloc(sizeof(exinfo_ring)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
exr->eX=eX;
/* negate angles */
@ -427,7 +427,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
source->stype=STYPE_SHAPELET;
if((exs=(exinfo_shapelet*)malloc(sizeof(exinfo_shapelet)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
exs->eX=eX;
exs->eY=eY;
@ -474,7 +474,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
/* setup the array of cluster/source information */
if ((*carr=(clus_source_t*)malloc((size_t)(g_list_length(clusters))*sizeof(clus_source_t)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
ci=0;
@ -491,48 +491,48 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
if (((*carr)[ci].ll=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].mm=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].nn=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].sI=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].stype=(unsigned char*)malloc((size_t)((*carr)[ci].N)*sizeof(unsigned char)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].ex=(void**)malloc((size_t)((*carr)[ci].N)*sizeof(void*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
/* for handling multi channel data */
if (((*carr)[ci].sI0=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].f0=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].spec_idx=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].spec_idx1=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
if (((*carr)[ci].spec_idx2=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
return 1;
exit(1);
}
@ -689,3 +689,55 @@ update_ignorelist(const char *ignfile, int *ignlist, int M, clus_source_t *carr)
printf("Total %d clustes ignored in simulation.\n",cn);
return 0;
}
int
read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double *arhoslave) {
FILE *cfp;
int c,ci,cj,cluster_id,hybrid,hb;
double admm_rho;
if ((cfp=fopen(admm_rho_file,"r"))==0) {
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
exit(1);
}
c=skip_lines(cfp);
ci=0; /* store it in reverse order */
cj=0;
while(c>=0) {
c=fscanf(cfp,"%d %d %lf",&cluster_id,&hybrid,&admm_rho);
/* add this value to arho array */
if (c!=EOF && c>0) {
/* found a valid line */
arhoslave[M-1-cj]=admm_rho; /* reverse order */
for (hb=0; hb<hybrid; hb++) {
if (hb==0) {
arhoslave[M-1-cj]=admm_rho; /* reverse order */
//printf("clus=%d arr=%d rhoslave=%lf\n",cluster_id,M-1-cj,admm_rho);
cj++;
}
arho[Mt-1-ci]=admm_rho; /* reverse order */
//printf("clus=%d arr=%d rho=%lf\n",cluster_id,Mt-1-ci,admm_rho);
if (ci<Mt-1) {
ci++;
} else {
/* array size does not match one given by text file */
break;
}
}
}
c=skip_restof_line(cfp);
c=skip_lines(cfp);
}
/* report any errors */
if (!(c==EOF && ci==Mt-1)) {
fprintf(stderr,"%s: %d: Error: cluster numbers in cluster file and regularization file do not match up.\n",__FILE__,__LINE__);
}
fclose(cfp);
return 0;
}

View File

@ -573,11 +573,11 @@ residual_threadfn_onefreq(void *data) {
fratio=log(freq0/t->carr[cm].f0[cn]);
fratio1=fratio*fratio;
fratio2=fratio1*fratio;
/* catch -ve sI */
/* catch -ve and 0 sI */
if (t->carr[cm].sI0[cn]>0.0) {
prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
} else {
prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
}
} else {
prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
@ -846,11 +846,11 @@ residual_threadfn_multifreq(void *data) {
fratio=log(freq0/t->carr[cm].f0[cn]);
fratio1=fratio*fratio;
fratio2=fratio1*fratio;
/* catch -ve sI */
/* catch -ve and 0 sI */
if (t->carr[cm].sI0[cn]>0.0) {
prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
} else {
prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
}
} else {
prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
@ -1087,11 +1087,11 @@ visibilities_threadfn_multifreq(void *data) {
fratio=log(freq0/t->carr[cm].f0[cn]);
fratio1=fratio*fratio;
fratio2=fratio1*fratio;
/* catch -ve sI */
/* catch -ve and 0 sI */
if (t->carr[cm].sI0[cn]>0.0) {
prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
} else {
prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
}
} else {
prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
@ -1151,7 +1151,7 @@ visibilities_threadfn_multifreq(void *data) {
/* FIXME: tail timeslots still not written properly (probably due to flagging while reading data) */
int
predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,int Nt) {
predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,int Nt, int add_to_data) {
int nth,nth1,ci;
int Nthb0,Nthb;
@ -1177,8 +1177,10 @@ predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int
exit(1);
}
/* set output column to zero */
memset(x,0,sizeof(double)*8*Nbase*tilesz*Nchan);
if (!add_to_data) {
/* set output column to zero */
memset(x,0,sizeof(double)*8*Nbase*tilesz*Nchan);
}
/* iterate over threads, allocating baselines per thread */
ci=0;
@ -1211,6 +1213,8 @@ predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int
threaddata[nth].fdelta=fdelta/(double)Nchan;
threaddata[nth].tdelta=tdelta;
threaddata[nth].dec0=dec0;
threaddata[nth].add_to_data=add_to_data;
pthread_create(&th_array[nth],&attr,visibilities_threadfn_multifreq,(void*)(&threaddata[nth]));
/* next baseline set */
@ -1252,8 +1256,10 @@ predictwithgain_threadfn_multifreq(void *data) {
/* iterate over the sky model and calculate contribution */
/* for this x[8*ci:8*(ci+1)-1] */
/* if this baseline is flagged, we do not compute */
for (cf=0; cf<t->Nchan; cf++) {
memset(&t->x[8*ci+cf*Ntilebase*8],0,sizeof(double)*8);
if (!t->add_to_data) { /* only model is written as output */
for (cf=0; cf<t->Nchan; cf++) {
memset(&t->x[8*ci+cf*Ntilebase*8],0,sizeof(double)*8);
}
}
/* stations for this baseline */
@ -1303,11 +1309,11 @@ predictwithgain_threadfn_multifreq(void *data) {
fratio=log(freq0/t->carr[cm].f0[cn]);
fratio1=fratio*fratio;
fratio2=fratio1*fratio;
/* catch -ve sI */
/* catch -ve and 0 sI */
if (t->carr[cm].sI0[cn]>0.0) {
prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
} else {
prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
}
} else {
prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
@ -1364,12 +1370,44 @@ predictwithgain_threadfn_multifreq(void *data) {
}
}
}
/* if valid cluster is given, correct with its solutions */
if (t->pinv) {
cm=t->ccid;
px=(ci+t->boff)/((Ntilebase+t->carr[cm].nchunk-1)/t->carr[cm].nchunk);
pm=&(t->pinv[8*t->N*px]);
G1[0]=(pm[sta1*8])+_Complex_I*(pm[sta1*8+1]);
G1[1]=(pm[sta1*8+2])+_Complex_I*(pm[sta1*8+3]);
G1[2]=(pm[sta1*8+4])+_Complex_I*(pm[sta1*8+5]);
G1[3]=(pm[sta1*8+6])+_Complex_I*(pm[sta1*8+7]);
G2[0]=(pm[sta2*8])+_Complex_I*(pm[sta2*8+1]);
G2[1]=(pm[sta2*8+2])+_Complex_I*(pm[sta2*8+3]);
G2[2]=(pm[sta2*8+4])+_Complex_I*(pm[sta2*8+5]);
G2[3]=(pm[sta2*8+6])+_Complex_I*(pm[sta2*8+7]);
/* now do correction, if any */
C[0]=t->x[8*ci]+_Complex_I*t->x[8*ci+1];
C[1]=t->x[8*ci+2]+_Complex_I*t->x[8*ci+3];
C[2]=t->x[8*ci+4]+_Complex_I*t->x[8*ci+5];
C[3]=t->x[8*ci+6]+_Complex_I*t->x[8*ci+7];
/* T1=G1*C */
amb(G1,C,T1);
/* T2=T1*G2' */
ambt(T1,G2,T2);
t->x[8*ci]=creal(T2[0]);
t->x[8*ci+1]=cimag(T2[0]);
t->x[8*ci+2]=creal(T2[1]);
t->x[8*ci+3]=cimag(T2[1]);
t->x[8*ci+4]=creal(T2[2]);
t->x[8*ci+5]=cimag(T2[2]);
t->x[8*ci+6]=creal(T2[3]);
t->x[8*ci+7]=cimag(T2[3]);
}
}
return NULL;
}
int
predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignlist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt) {
predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignlist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt, int add_to_data, int ccid, double rho) {
int nth,nth1,ci;
int Nthb0,Nthb;
@ -1379,6 +1417,32 @@ predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,d
int Nbase1=Nbase*tilesz;
int cm,cj;
double *pm,*pinv=0;
cm=-1;
/* find if any cluster is specified for correction of data */
for (cj=0; cj<M; cj++) { /* clusters */
/* check if cluster id == ccid to do a correction */
if (carr[cj].id==ccid) {
cm=cj;
ci=1; /* correction cluster found */
}
}
if (cm>=0) { /* valid cluser for correction */
/* allocate memory for inverse J */
if ((pinv=(double*)malloc((size_t)8*N*carr[cm].nchunk*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
for (cj=0; cj<carr[cm].nchunk; cj++) {
pm=&(p[carr[cm].p[cj]]); /* start of solutions */
/* invert N solutions */
for (ci=0; ci<N; ci++) {
mat_invert(&pm[8*ci],&pinv[8*ci+8*N*cj], rho);
}
}
}
/* calculate min baselines a thread can handle */
Nthb0=(Nbase1+Nt-1)/Nt;
@ -1427,6 +1491,11 @@ predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,d
threaddata[nth].fdelta=fdelta/(double)Nchan;
threaddata[nth].tdelta=tdelta;
threaddata[nth].dec0=dec0;
threaddata[nth].add_to_data=add_to_data;
/* for correction of data */
threaddata[nth].pinv=pinv;
threaddata[nth].ccid=cm;
pthread_create(&th_array[nth],&attr,predictwithgain_threadfn_multifreq,(void*)(&threaddata[nth]));
/* next baseline set */

View File

@ -321,10 +321,11 @@ __global__ void
kernel_evaluatenu_fl_eight(int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
/* each block calculte psi((nu+8)/2)-log((nu+8)/2) */
/* actually p=2, so psi((nu+2)/2)-log((nu+2)/2) */
float dgm0;
if (threadIdx.x==0) {
dgm0=digamma_fl(nu0*0.5f+4.0f);
dgm0=dgm0-logf((nu0+8.0f)*0.5f); /* psi((nu0+8)/2)-log((nu0+8)/2) */
dgm0=digamma_fl(nu0*0.5f+1.0f);
dgm0=dgm0-logf((nu0+2.0f)*0.5f); /* psi((nu0+8)/2)-log((nu0+8)/2) */
}
__syncthreads();
if (tid<Nd) {
@ -453,7 +454,7 @@ cudakernel_evaluatenu_fl(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float q
/* evaluate expression for finding optimum nu for
a range of nu values, using AECM
a range of nu values, using AECM (p=8 before, but now p=2)
nu0: current value of robust_nu*/
void
cudakernel_evaluatenu_fl_eight(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {

View File

@ -911,7 +911,9 @@ func_grad_robust(
int
lbfgs_fit_robust(
void (*func)(double *p, double *hx, int m, int n, void *adata),
double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {
double *p, double *x, int m, int n, int itmax, int M, int gpu_threads,
int whiten, /* if >0 whiten data 1: NCP, 2... */
void *adata) {
double *gk; /* gradients at both k+1 and k iter */
double *xk1,*xk; /* parameters at k+1 and k iter */

View File

@ -1204,6 +1204,7 @@ osrlevmar_der_single_cuda_fl(
int ntiles, /* total tile (data) size being solved for */
double robust_nulow, double robust_nuhigh, /* robust nu range */
int randomize, /* if >0 randomize */
int whiten, /* if >0 whiten data 1: NCP, 2... */
void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
* Set to NULL if not needed
*/
@ -1883,6 +1884,7 @@ rlevmar_der_single_nocuda(
int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
int Nt, /* no of threads */
double robust_nulow, double robust_nuhigh, /* robust nu range */
int whiten, /* if >0 whiten data 1: NCP, 2... */
void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
* Set to NULL if not needed
*/
@ -2019,17 +2021,19 @@ rlevmar_der_single_nocuda(
exit(1);
}
WORK=Ud=Sd=VTd=0;
// for (ci=0;ci<M; ci++) {
// aones[ci]=1.0;
// }
me_data_t *dt=(me_data_t*)adata;
setweights(M,aones,1.0,dt->Nt);
int nw,wt_itmax=3;
me_data_t *lmdata=(me_data_t*)adata;
double wt_sum,lambda,robust_nu=lmdata->robust_nu;
double robust_nu1;
setweights(M,aones,1.0,lmdata->Nt);
/*W set initial weights to 1 */
// for (ci=0;ci<N; ci++) {
// wtd[ci]=1.0;
// }
setweights(N,wtd,1.0,dt->Nt);
setweights(N,wtd,1.0,lmdata->Nt);
/* modify weights with whitening weights */
if (whiten) {
/* use correct offset for u,v based on tile offset */
add_whitening_weights(N/8, wtd, &lmdata->u[lmdata->tileoff*lmdata->Nbase], &lmdata->v[lmdata->tileoff*lmdata->Nbase], *(lmdata->freq0), lmdata->Nt);
}
/* memory allocation: different solvers */
if (solve_axb==0) {
@ -2078,10 +2082,6 @@ rlevmar_der_single_nocuda(
}
}
int nw,wt_itmax=3;
me_data_t *lmdata=(me_data_t*)adata;
double wt_sum,lambda,robust_nu=lmdata->robust_nu;
double robust_nu1;
/* EM iteration loop */
/************************************************************/
@ -2403,6 +2403,10 @@ printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
printf("nu updated from %lf in [%lf,%lf] to %lf\n",robust_nu,robust_nulow, robust_nuhigh,robust_nu1);
#endif
robust_nu=robust_nu1;
if (whiten) {
/* use correct offset for u,v based on tile offset */
add_whitening_weights(N/8, wtd, &lmdata->u[lmdata->tileoff*lmdata->Nbase], &lmdata->v[lmdata->tileoff*lmdata->Nbase], *(lmdata->freq0), lmdata->Nt);
}
/* normalize weights */
wt_sum=lambda/(double)N;
@ -2491,6 +2495,7 @@ osrlevmar_der_single_nocuda(
int Nt, /* no of threads */
double robust_nulow, double robust_nuhigh, /* robust nu range */
int randomize, /* if >0 randomize */
int whiten, /* if >0 whiten data 1: NCP, 2... */
void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
* Set to NULL if not needed
*/
@ -2611,16 +2616,20 @@ osrlevmar_der_single_nocuda(
exit(1);
}
WORK=Ud=Sd=VTd=0;
// for (ci=0;ci<M; ci++) {
// aones[ci]=1.0;
// }
me_data_t *dt=(me_data_t*)adata;
setweights(M,aones,1.0,dt->Nt);
me_data_t *lmdata0=(me_data_t*)adata;
int nw,wt_itmax=3;
double wt_sum,lambda,robust_nu=lmdata0->robust_nu;
double robust_nu1;
setweights(M,aones,1.0,lmdata0->Nt);
/*W set initial weights to 1 */
// for (ci=0;ci<N; ci++) {
// wtd[ci]=1.0;
// }
setweights(N,wtd,1.0,dt->Nt);
setweights(N,wtd,1.0,lmdata0->Nt);
/* modify weights with whitening weights */
if (whiten) {
add_whitening_weights(N/8, wtd, &lmdata0->u[lmdata0->tileoff*lmdata0->Nbase], &lmdata0->v[lmdata0->tileoff*lmdata0->Nbase], *(lmdata0->freq0), lmdata0->Nt);
}
/* memory allocation: different solvers */
if (solve_axb==0) {
@ -2668,10 +2677,7 @@ osrlevmar_der_single_nocuda(
}
}
int nw,wt_itmax=3;
me_data_t *lmdata0=(me_data_t*)adata;
double wt_sum,lambda,robust_nu=lmdata0->robust_nu;
double robust_nu1;
/* setup OS subsets and stating offsets */
/* ME data for Jacobian calculation (need a new one) */
me_data_t lmdata;
@ -2737,7 +2743,6 @@ osrlevmar_der_single_nocuda(
l=l+Ntpersubset;
}
/* EM iteration loop */
/************************************************************/
for (nw=0; nw<wt_itmax; nw++) {
@ -3051,6 +3056,9 @@ printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
printf("nu updated from %lf in [%lf,%lf] to %lf\n",robust_nu,robust_nulow, robust_nuhigh,robust_nu1);
#endif
robust_nu=robust_nu1;
if (whiten) {
add_whitening_weights(N/8, wtd, &lmdata0->u[lmdata0->tileoff*lmdata0->Nbase], &lmdata0->v[lmdata0->tileoff*lmdata0->Nbase], *(lmdata0->freq0), lmdata0->Nt);
}
/* normalize weights */
wt_sum=lambda/(double)N;

View File

@ -312,6 +312,7 @@ fns_f(complex double *x, double *y, global_data_rtr_t *gdata) {
/* worker thread function for weight update (nu+8)/(nu+error^2) */
/* update: error: min of XX,XY,YX,YY errors, so p=2 */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
@ -363,7 +364,8 @@ threadfn_fns_fupdate_weights(void *data) {
double r11=t->y[8*ci+6]-creal(T2[3]);
double i11=t->y[8*ci+7]-cimag(T2[3]);
t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
//t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
t->wtd[ci] = (t->nu0+2.0)/(t->nu0+MAX(r00*r00+i00*i00,MAX(r01*r01+i01*i01,MAX(r10*r10+i10*i10,r11*r11+i11*i11))));
}
}
@ -475,12 +477,15 @@ fns_fupdate_weights(complex double *x, double *y, global_data_rtr_t *gdata) {
pthread_join(gdata->th_array[nth1],NULL);
}
sumlogw/=(double)Nbase1;
free(threaddata);
free(threaddata);
/* find new value for nu, p-variate T dist, p=8 */
/* find new value for nu, p-variate T dist, p=8 (update p=2 because using MAX() for residual calculation, not sum) */
/* psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0, AECM */
double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 8, dp->robust_nu);
double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 2, dp->robust_nu);
/* make sure new value stays within bounds */
if (nu1<gdata->nulow) { return gdata->nulow; }
if (nu1>gdata->nuhigh) { return gdata->nuhigh; }
return nu1;
}
@ -1400,6 +1405,7 @@ armijostep(int N,complex double *x,complex double *teta, double *y, global_data_
return nocostred;
}
/* Fine tune initial trust region radius, also update initial value for x
A. Sartenaer, 1995
returns : trust region estimate,
@ -1427,10 +1433,10 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
fns_fgrad(x,eta,y,gdata,1);
//normalize
double eta_nrm=my_cnrm2(4*N,eta);
my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta);
my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta);
my_ccopy(4*N,eta,1,s,1);
my_cscal(4*N, delta_0+0.0*_Complex_I, s);
my_cscal(4*N, delta_0+0.0*_Complex_I, s);
//Hessian at s
fns_fhess(x,s,Heta,y,gdata);
@ -1439,7 +1445,7 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
double mu_0=0.5; double mu_1=0.5; double mu_2=0.35;
double teta=0.25;
int m,MK=4;
for (m=0; m<MK; m++) {
/* x_prop=x0-s */
@ -1449,7 +1455,7 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
/* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
mk=f0-fns_g(N,x_prop,eta,s)-0.5*fns_g(N,x_prop,Heta,s);
fk=fns_f(x_prop,y,gdata);
if (f0==mk) {
rho=1e9;
} else {
@ -1521,7 +1527,7 @@ printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,
#endif
my_ccopy(4*N,eta,1,s,1);
my_cscal(4*N,delta_0+0.0*_Complex_I, s);
my_cscal(4*N,delta_0+0.0*_Complex_I, s);
}
@ -1539,6 +1545,8 @@ printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,
return Delta0;
}
int
rtr_solve_nocuda_robust(
double *x0, /* initial values and updated solution at output (size 8*N double) */
@ -1694,6 +1702,7 @@ rtr_solve_nocuda_robust(
int rsdstat=0;
/***************************************************/
/* RSD solution */
//for (ci=0; ci<itmax_rsd; ci++) {
for (ci=0; ci<0; ci++) {
/* Armijo step */
/* teta=armijostep(V,C,N,x); */
@ -1707,11 +1716,13 @@ rtr_solve_nocuda_robust(
}
}
double Delta_new=itrr(N,x,eta,Heta, y, &gdata, fgradx, x_prop);
#ifdef DEBUG
printf("TR radius given=%lf est=%lf\n",Delta0,Delta_new);
#endif
//old values
//Delta_bar=MIN(fx,0.01);
//Delta0=Delta_bar*0.125;
Delta0=MIN(Delta_new,0.01); /* need to be more restrictive for EM */
@ -1971,3 +1982,265 @@ Delta = Delta0;
free(x);
return 0;
}
int
nsd_solve_nocuda_robust(
double *x0, /* initial values and updated solution at output (size 8*N double) */
double *y, /* data vector (size 8*M double) */
int N, /* no. of stations */
int M, /* no. of constraints */
int itermax, /* maximum number of iterations RSD */
double robust_nulow, double robust_nuhigh, /* robust nu range */
double *info, /* initial and final residuals */
me_data_t *adata) { /* pointer to additional data */
/* reshape x to make J: 2Nx2 complex double
*/
complex double *x;
if ((x=(complex double*)malloc((size_t)4*N*sizeof(complex double)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
/* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
to
J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
*/
double *Jd=(double*)x;
/* re J(0,0) */
my_dcopy(N, &x0[0], 8, &Jd[0], 4);
/* im J(0,0) */
my_dcopy(N, &x0[1], 8, &Jd[1], 4);
/* re J(1,0) */
my_dcopy(N, &x0[4], 8, &Jd[2], 4);
/* im J(1,0) */
my_dcopy(N, &x0[5], 8, &Jd[3], 4);
/* re J(0,1) */
my_dcopy(N, &x0[2], 8, &Jd[4*N], 4);
/* im J(0,1) */
my_dcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
/* re J(1,1) */
my_dcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
/* im J(1,1) */
my_dcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
int Nt=adata->Nt;
int ci;
global_data_rtr_t gdata;
gdata.medata=adata;
/* setup threads */
pthread_attr_init(&gdata.attr);
pthread_attr_setdetachstate(&gdata.attr,PTHREAD_CREATE_JOINABLE);
if ((gdata.th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
if ((gdata.mx_array=(pthread_mutex_t*)malloc((size_t)N*sizeof(pthread_mutex_t)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
if ((gdata.iw=(double*)malloc((size_t)N*sizeof(double)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
/* weights for robust LS, length could be less than total no of baselines
therefore use relative offset boff */
if ((gdata.wtd=(double*)malloc((size_t)M*sizeof(double)))==0) {
#ifndef USE_MIC
printf("%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
for (ci=0; ci<N; ci++) {
pthread_mutex_init(&gdata.mx_array[ci],NULL);
}
/* count baseline->station contributions
NOTE: has to be done here because the baseline offset would change */
fns_fcount(&gdata);
/***************************************************/
complex double *fgradx,*eta,*z,*x_prop,*z_prop;
if ((fgradx=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
if ((eta=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
if ((z=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
if ((x_prop=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
if ((z_prop=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
/*set initial weights to 1 */
setweights(M,gdata.wtd,1.0,Nt);
gdata.nulow=robust_nulow;
gdata.nuhigh=robust_nuhigh;
double fx;
fx=fns_f(x,y,&gdata);
double fx0=fx;
/***************************************************/
/* gradient at x0 */
fns_fgrad(x,fgradx,y,&gdata,1);
/* Hessian at x0,x0 */
fns_fhess(x,x,z,y,&gdata);
/* intial step ~ 1/||Hessian|| */
double hess_nrm=my_cnrm2(4*N,z);
double t=1.0/hess_nrm;
/* if initial step too small */
if (t<1e-6) {
t=1e-6;
}
/* z <= x */
my_ccopy(4*N,x,1,z,1);
double theta=1.0;
double ALPHA=1.01; /* step-size growth factor */
double BETA=0.5; /* step-size shrinkage factor */
int k;
for (k=0; k<itermax; k++) {
/* x_prop <= x */
my_ccopy(4*N,x,1,x_prop,1);
/* z_prop <= z */
my_ccopy(4*N,z,1,z_prop,1);
/* x <= z - t * grad */
my_ccopy(4*N,z,1,x,1);
my_caxpy(4*N, fgradx, -t+0.0*_Complex_I, x);
/* if ||x-z|| == t||grad|| is below threshold, stop iteration */
double grad_nrm=my_cnrm2(4*N,fgradx);
double x_nrm=my_cnrm2(4*N,x);
/* norm(y-x)/max(1,norm(x)); */
if (grad_nrm*t/MAX(1.0,x_nrm) < 1e-6) {
break;
}
/* theta = 2/(1 + sqrt(1+4/(theta^2))); */
theta=2.0/(1.0 + sqrt(1.0+4.0/(theta*theta)));
/* z = x + (1-theta)*(x-x_prop);
z = (2-theta)*x - (1-theta) * x_prop */
my_ccopy(4*N,x,1,z,1);
my_cscal(4*N, 2.0-theta+0.0*_Complex_I, z);
my_caxpy(4*N, x_prop, -(1.0-theta)+0.0*_Complex_I, z);
/* eta = grad_old;
grad <= grad_f( z ) */
my_ccopy(4*N,fgradx,1,eta,1);
fns_fgrad(z,fgradx,y,&gdata,1);
/* z_prop <= z_prop - z */
my_caxpy(4*N, z, -1.0+0.0*_Complex_I, z_prop);
/* eta <= eta - new_grad */
my_caxpy(4*N, fgradx, -1.0+0.0*_Complex_I, eta);
/* ||z-z_prop|| */
double ydiffnrm=my_cnrm2(4*N,z_prop);
/* (z-z_prop)'*(grad-grad_old) */
double dot_ydiff_gdiff=my_ddot(8*N, (double *)z_prop, (double *)eta);
#ifdef DEBUG
printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
#endif
/* the above can be NAN, if so break loop */
if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) {
break;
}
/* backtracking
t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
t = min( ALPHA*t, max( BETA*t, t_hat ));
*/
double t_hat=0.5*(ydiffnrm*ydiffnrm)/fabs(dot_ydiff_gdiff);
t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
#ifdef DEBUG
printf("k=%d theta=%e step=%e\n",k,theta,t);
#endif
}
fx=fns_f(x,y,&gdata);
/* final residual */
info[1]=fx;
#ifdef DEBUG
printf("k=%d cost initial=%e final=%e\n",k,fx0,fx);
#endif
free(fgradx);
free(eta);
free(z);
free(z_prop);
free(x_prop);
/***************************************************/
double robust_nu1=fns_fupdate_weights(x,y,&gdata);
adata->robust_nu=robust_nu1;
if (fx0>fx) {
/* copy back solution to x0 */
/* re J(0,0) */
my_dcopy(N, &Jd[0], 4, &x0[0], 8);
/* im J(0,0) */
my_dcopy(N, &Jd[1], 4, &x0[1], 8);
/* re J(1,0) */
my_dcopy(N, &Jd[2], 4, &x0[4], 8);
/* im J(1,0) */
my_dcopy(N, &Jd[3], 4, &x0[5], 8);
/* re J(0,1) */
my_dcopy(N, &Jd[4*N], 4, &x0[2], 8);
/* im J(0,1) */
my_dcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
/* re J(1,1) */
my_dcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
/* im J(1,1) */
my_dcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
}
for (ci=0; ci<N; ci++) {
pthread_mutex_destroy(&gdata.mx_array[ci]);
}
pthread_attr_destroy(&gdata.attr);
free(gdata.th_array);
free(gdata.mx_array);
free(gdata.iw);
free(gdata.wtd);
free(x);
return 0;
}

View File

@ -331,7 +331,8 @@ fns_f(complex double *x, double *y, global_data_rtr_t *gdata) {
}
/* worker thread function for weight update (nu+8)/(nu+error^2) */
/* worker thread function for weight update (nu+p)/(nu+error^2) */
/* p=2, not p=8 because using MAX() not sum for error^2 */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
@ -383,7 +384,8 @@ threadfn_fns_fupdate_weights(void *data) {
double r11=t->y[8*ci+6]-creal(T2[3]);
double i11=t->y[8*ci+7]-cimag(T2[3]);
t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
//t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
t->wtd[ci] = (t->nu0+2.0)/(t->nu0+MAX(r00*r00+i00*i00,MAX(r01*r01+i01*i01,MAX(r10*r10+i10*i10,r11*r11+i11*i11))));
}
}
@ -495,11 +497,14 @@ fns_fupdate_weights(complex double *x, double *y, global_data_rtr_t *gdata) {
pthread_join(gdata->th_array[nth1],NULL);
}
sumlogw/=(double)Nbase1;
free(threaddata);
free(threaddata);
/* find new value for nu, p-variate T dist, p=8 */
/* find new value for nu, p-variate T dist, p=8 (update p=2 because using MAX() for residual calculation, not sum) */
/* psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0, AECM */
double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 8, dp->robust_nu);
double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 2, dp->robust_nu);
/* make sure new value stays within bounds */
if (nu1<gdata->nulow) { return gdata->nulow; }
if (nu1>gdata->nuhigh) { return gdata->nuhigh; }
return nu1;
}
@ -1385,6 +1390,145 @@ armijostep(int N,complex double *x,complex double *teta, double *y, global_data_
return nocostred;
}
/* Fine tune initial trust region radius, also update initial value for x
A. Sartenaer, 1995
returns : trust region estimate,
also modifies x
eta,Heta,s,x_prop: used as storage
*/
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static double
itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *y, global_data_rtr_t *gdata, complex double *s, complex double *x_prop) {
double f0,fk,mk,rho,rho1,Delta0;
/* initialize trust region radii */
double delta_0=1.0;
double delta_m=0.0;
double sigma=0.0;
double delta=0.0;
// initial cost
f0=fns_f(x,y,gdata);
// gradient at x0
fns_fgrad(x,eta,y,gdata,1);
//normalize
double eta_nrm=my_cnrm2(4*N,eta);
my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta);
my_ccopy(4*N,eta,1,s,1);
my_cscal(4*N, delta_0+0.0*_Complex_I, s);
//Hessian at s
fns_fhess(x,s,Heta,y,gdata);
/* constants used */
double gamma_1=0.0625; double gamma_2=5.0; double gamma_3=0.5; double gamma_4=2.0;
double mu_0=0.5; double mu_1=0.5; double mu_2=0.35;
double teta=0.25;
int m,MK=4;
for (m=0; m<MK; m++) {
/* x_prop=x0-s */
my_ccopy(4*N,x,1,x_prop,1);
my_caxpy(4*N, s, -1.0+0.0*_Complex_I, x_prop);
/* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
mk=f0-fns_g(N,x_prop,eta,s)-0.5*fns_g(N,x_prop,Heta,s);
fk=fns_f(x_prop,y,gdata);
if (f0==mk) {
rho=1e9;
} else {
rho=(f0-fk)/(f0-mk);
}
rho1=fabs(rho-1.0);
/* update max radius */
if (rho1<mu_0) {
delta_m=MAX(delta_m,delta_0);
}
if ((f0-fk)>delta) {
delta=f0-fk;
sigma=delta_0;
}
/* radius update */
double beta_1,beta_2,beta_i;
beta_1=0.0;
beta_2=0.0;
if (m<MK) {
double g0_s=fns_g(N,x,eta,s);
double b1=(teta*(f0-g0_s)+(1.0-teta)*mk-fk);
beta_1=(b1==0.0?1e9:-teta*g0_s/b1);
double b2=(-teta*(f0-g0_s)+(1.0+teta)*mk-fk);
beta_2=(b2==0.0?1e9:teta*g0_s/b2);
double minbeta=MIN(beta_1,beta_2);
double maxbeta=MAX(beta_1,beta_2);
if (rho1>mu_1) {
if (minbeta>1.0) {
beta_i=gamma_3;
} else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0)) {
beta_i=gamma_1;
} else if ((beta_1>=gamma_1 && beta_1<1.0) && (beta_2<gamma_1 || beta_2>=1.0)) {
beta_i=beta_1;
} else if ((beta_2>=gamma_1 && beta_2<1.0) && (beta_1<gamma_1 || beta_1>=1.0)) {
beta_i=beta_2;
} else {
beta_i=maxbeta;
}
} else if (rho1<=mu_2) {
if (maxbeta<1.0) {
beta_i=gamma_4;
} else if (maxbeta>gamma_2) {
beta_i=gamma_2;
} else if ((beta_1>=1.0 && beta_1<=gamma_2) && beta_2<1.0) {
beta_i=beta_1;
} else if ((beta_2>=1.0 && beta_2<=gamma_2) && beta_1<1.0) {
beta_i=beta_2;
} else {
beta_i=maxbeta;
}
} else {
if (maxbeta<gamma_3) {
beta_i=gamma_3;
} else if (maxbeta>gamma_4) {
beta_i=gamma_4;
} else {
beta_i=maxbeta;
}
}
/* update radius */
delta_0=delta_0/beta_i;
}
#ifdef DEBUG
printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
#endif
my_ccopy(4*N,eta,1,s,1);
my_cscal(4*N,delta_0+0.0*_Complex_I, s);
}
// update initial value
if (delta>0.0) {
my_caxpy(4*N, eta, -sigma+0.0*_Complex_I, x);
}
if (delta_m>0.0) {
Delta0=delta_m;
} else {
Delta0=delta_0;
}
return Delta0;
}
int
@ -1583,7 +1727,8 @@ rtr_solve_nocuda_robust_admm(
int rsdstat=0;
/***************************************************/
/* RSD solution */
for (ci=0; ci<itmax_rsd; ci++) {
//for (ci=0; ci<itmax_rsd; ci++) {
for (ci=0; ci<0; ci++) {
/* Armijo step */
/* teta=armijostep(V,C,N,x); */
rsdstat=armijostep(N,x,eta,y,&gdata,fgradx,x_prop,&fx); /* NOTE last two are just storage */
@ -1596,7 +1741,16 @@ rtr_solve_nocuda_robust_admm(
}
}
Delta_bar=MIN(fx,Delta_bar);
double Delta_new=itrr(N,x,eta,Heta, y, &gdata, fgradx, x_prop);
#ifdef DEBUG
printf("TR radius given=%lf est=%lf\n",Delta0,Delta_new);
#endif
//old values
//Delta_bar=MIN(fx,Delta_bar);
Delta0=MIN(Delta_new,0.01); /* need to be more restrictive for EM */
Delta_bar=Delta0*8.0;
rho_regularization=fx*1e-6;
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);

View File

@ -116,7 +116,7 @@ cudakernel_fns_fgrad_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int
to tangent space before it is averaged
so calculate grad using N(N-1)/2 constraints each (total M)
*/
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
/* need 8N*M/ThreadsPerBlock+ 8N float storage */
static void
cudakernel_fns_fgrad_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, float *y, float *coh, char *bbh, float *iw, float *wtd, int negate, cublasHandle_t cbhandle,float *gWORK) {
/* baselines per timeslot = N(N-1)/2 ~2400, timeslots = M/baselines ~120
@ -213,11 +213,10 @@ printf("N=%d Baselines=%d timeslots=%d total=%d,Threads=%d Blocks=%d\n",N,nbase,
}
cudaMemcpy(eta,tempeta,4*N*sizeof(cuFloatComplex),cudaMemcpyDeviceToDevice);
cudaFree(Bd);
//cudakernel_fns_proj(N, x, tempeta, eta, cbhandle);
}
/* Hessian, also projected to tangent space */
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
/* need 8N*M/ThreadsPerBlock+ 8N*2 float storage */
static void
cudakernel_fns_fhess_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
cuFloatComplex *tempeta,*tempb;
@ -265,7 +264,7 @@ cudakernel_fns_fhess_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int
to tangent space before it is averaged
so calculate grad using N(N-1)/2 constraints each (total M)
*/
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
/* need 8N*M/ThreadsPerBlock+ 8N float storage */
static void
cudakernel_fns_fhess_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
cuFloatComplex *tempeta;
@ -350,7 +349,6 @@ printf("N=%d Baselines=%d timeslots=%d total=%d,Threads=%d Blocks=%d\n",N,nbase,
cudaMemcpy(fhess,tempeta,4*N*sizeof(cuFloatComplex),cudaMemcpyDeviceToDevice);
cudaFree(Bd);
//cudakernel_fns_proj(N, x, tempeta, fhess, cbhandle);
}
@ -449,6 +447,160 @@ printf("m=%d lhs=%e rhs=%e rat=%e norm=%e\n",m,lhs,rhs,lhs/rhs,metric);
}
/* Fine tune initial trust region radius, also update initial value for x
A. Sartenaer, 1995
returns : trust region estimate,
also modifies x
eta,Heta: used as storage
*/
/* need 8N*2 + MAX(2 Blocks + 4, 8N (1 + ceil(M/Threads))) float storage */
static float
itrr(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *Heta, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
cuFloatComplex alpha;
cublasStatus_t cbstatus;
/* temp storage, re-using global storage */
cuFloatComplex *s, *x_prop;
unsigned long int moff=0;
s=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
x_prop=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
float *gWORK1=&gWORK[moff];
float f0,fk,mk,rho,rho1,Delta0;
/* initialize trust region radii */
float delta_0=1.0f;
float delta_m=0.0f;
float sigma=0.0f;
float delta=0.0f;
// initial cost
f0=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,y,coh,bbh,wtd,gWORK1);
// gradient at x0;
cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,eta,y,coh,bbh,iw,wtd,1,cbhandle, gWORK1);
// normalize
float eta_nrm;
cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
alpha.x=1.0f/eta_nrm;alpha.y=0.0f;
cbstatus=cublasCscal(cbhandle,4*N,&alpha,eta,1);
cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
alpha.x=delta_0;alpha.y=0.0f;
cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
/* Hessian at s */
cudakernel_fns_fhess_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,s,Heta,y,coh,bbh,iw,wtd,cbhandle, gWORK1);
/* constants used */
float gamma_1=0.0625f; float gamma_2=5.0f; float gamma_3=0.5f; float gamma_4=2.0f;
float mu_0=0.5f; float mu_1=0.5f; float mu_2=0.35f;
float teta=0.25f;
int MK=4;
int m;
for (m=0; m<MK; m++) {
/* x_prop=x0-s */
cbstatus=cublasCcopy(cbhandle,4*N,x,1,x_prop,1);
alpha.x=-1.0f;alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, s, 1, x_prop, 1);
/* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
mk=f0-cudakernel_fns_g(N,x_prop,eta,s,cbhandle)-0.5f*cudakernel_fns_g(N,x_prop,Heta,s,cbhandle);
fk=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,y,coh,bbh,wtd,gWORK1);
if (f0==mk) {
rho=1e9f;
} else {
rho=(f0-fk)/(f0-mk);
}
rho1=fabsf(rho-1.0f);
/* update max radius */
if (rho1<mu_0) {
delta_m=MAX(delta_m,delta_0);
}
if ((f0-fk)>delta) {
delta=f0-fk;
sigma=delta_0;
}
/* radius update */
float beta_1,beta_2,beta_i;
beta_1=0.0f;
beta_2=0.0f;
if (m<MK) {
float g0_s=cudakernel_fns_g(N,x,eta,s,cbhandle);
float b1=(teta*(f0-g0_s)+(1.0f-teta)*mk-fk);
beta_1=(b1==0.0f?1e9f:-teta*g0_s/b1);
float b2=(-teta*(f0-g0_s)+(1.0f+teta)*mk-fk);
beta_2=(b2==0.0f?1e9f:teta*g0_s/b2);
float minbeta=MIN(beta_1,beta_2);
float maxbeta=MAX(beta_1,beta_2);
if (rho1>mu_1) {
if (minbeta>1.0f) {
beta_i=gamma_3;
} else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0f)) {
beta_i=gamma_1;
} else if ((beta_1>=gamma_1 && beta_1<1.0f) && (beta_2<gamma_1 || beta_2>=1.0f)) {
beta_i=beta_1;
} else if ((beta_2>=gamma_1 && beta_2<1.0f) && (beta_1<gamma_1 || beta_1>=1.0f)) {
beta_i=beta_2;
} else {
beta_i=maxbeta;
}
} else if (rho1<=mu_2) {
if (maxbeta<1.0f) {
beta_i=gamma_4;
} else if (maxbeta>gamma_2) {
beta_i=gamma_2;
} else if ((beta_1>=1.0f && beta_1<=gamma_2) && beta_2<1.0f) {
beta_i=beta_1;
} else if ((beta_2>=1.0f && beta_2<=gamma_2) && beta_1<1.0f) {
beta_i=beta_2;
} else {
beta_i=maxbeta;
}
} else {
if (maxbeta<gamma_3) {
beta_i=gamma_3;
} else if (maxbeta>gamma_4) {
beta_i=gamma_4;
} else {
beta_i=maxbeta;
}
}
/* update radius */
delta_0=delta_0/beta_i;
}
#ifdef DEBUG
printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
#endif
cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
alpha.x=delta_0;alpha.y=0.0f;
cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
}
// update initial value
if (delta>0.0f) {
alpha.x=-sigma; alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, eta, 1, x, 1);
}
if (delta_m>0.0f) {
Delta0=delta_m;
} else {
Delta0=delta_0;
}
return Delta0;
}
/* truncated conjugate gradient method
x, grad, eta, r, z, delta, Hxd : size 2N x 2 complex
so, vector size is 4N complex double
@ -572,6 +724,12 @@ tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComp
}
/* storage:
8N * 5 + N + 8M * 2 + 2M + M (base storage)
MAX( 2 * Blocks + 4, 8N(6 + ceil(M/Threads))) for functions
Blocks = ceil(M/Threads)
*/
int
rtr_solve_cuda_robust_fl(
float *x0, /* initial values and updated solution at output (size 8*N float) */
@ -745,28 +903,36 @@ rtr_solve_cuda_robust_fl(
printf("Initial Cost=%g\n",fx0);
#endif
/***************************************************/
int rsdstat=0;
/* RSD solution */
for (ci=0; ci<itmax_sd; ci++) {
/* Armijo step */
/* teta=armijostep(V,C,N,x); */
rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
/* x=R(x,teta); */
cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
//my_ccopy(4*N,x_propd,1,xd,1);
if (!rsdstat) {
/* cost reduced, update solution */
cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
} else {
/* no cost reduction, break loop */
break;
}
}
// int rsdstat=0;
// /* RSD solution - disabled */
// for (ci=0; ci<itmax_sd; ci++) {
// /* Armijo step */
// /* teta=armijostep(V,C,N,x); */
// rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
// /* x=R(x,teta); */
// cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
// if (!rsdstat) {
// /* cost reduced, update solution */
// cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
// } else {
// /* no cost reduction, break loop */
// break;
// }
// }
float Delta_new=itrr(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, Hetad, yd, cohd, bbd, iwd, wtd, cbhandle, gWORK1);
#ifdef DEBUG
printf("TR radius given=%f est=%f\n",Delta0,Delta_new);
#endif
//old values
//Delta_bar=MIN(fx,0.01f);
//Delta0=Delta_bar*0.125f;
Delta0=MIN(Delta_new,0.01f); /* need to be more restrictive for EM */
Delta_bar=Delta0*8.0f;
cudakernel_fns_fupdate_weights(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,robust_nu);
Delta_bar=MIN(fx,0.01f);
Delta0=Delta_bar*0.125f;
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
#ifdef DEBUG
@ -918,7 +1084,8 @@ printf("NEW RTR cost=%g\n",fx);
/***************************************************/
cudaDeviceSynchronize();
/* w <= (8+nu)/(1+error^2), q<=w-log(w) */
/* w <= (p+nu)/(1+error^2), q<=w-log(w) */
/* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
/* sumq<=sum(w-log(w))/N */
cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
@ -938,7 +1105,349 @@ printf("NEW RTR cost=%g\n",fx);
#ifdef DEBUG
printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
#endif
dp->robust_nu=(double)robust_nu1;
/* seems pedantic, but make sure new value for robust_nu fits within bounds */
if (robust_nu1<robust_nulow) {
dp->robust_nu=robust_nulow;
} else if (robust_nu1>robust_nuhigh) {
dp->robust_nu=robust_nuhigh;
} else {
dp->robust_nu=(double)robust_nu1;
}
if(fx0>fx) {
//printf("Cost final %g initial %g\n",fx,fx0);
/* copy back current solution */
err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
/* copy back solution to x0 : format checked*/
/* re J(0,0) */
my_fcopy(N, &Jd[0], 4, &x0[0], 8);
/* im J(0,0) */
my_fcopy(N, &Jd[1], 4, &x0[1], 8);
/* re J(1,0) */
my_fcopy(N, &Jd[2], 4, &x0[4], 8);
/* im J(1,0) */
my_fcopy(N, &Jd[3], 4, &x0[5], 8);
/* re J(0,1) */
my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
/* im J(0,1) */
my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
/* re J(1,1) */
my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
/* im J(1,1) */
my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
}
free(x);
return 0;
}
/* storage:
8N * 6 + N + 8M * 2 + 2M + M (base storage)
MAX( 2 * Blocks + 4, 8N(1 + ceil(M/Threads))) for functions
Blocks = ceil(M/Threads)
*/
int
nsd_solve_cuda_robust_fl(
float *x0, /* initial values and updated solution at output (size 8*N float) */
float *y, /* data vector (size 8*M float) */
int N, /* no of stations */
int M, /* no of constraints */
int itmax, /* maximum number of iterations */
double robust_nulow, double robust_nuhigh, /* robust nu range */
double *info, /* initial and final residuals */
cublasHandle_t cbhandle, /* device handle */
float *gWORK, /* GPU allocated memory */
int tileoff, /* tile offset when solving for many chunks */
int ntiles, /* total tile (data) size being solved for */
me_data_t *adata)
{
/* general note: all device variables end with a 'd' */
cudaError_t err;
cublasStatus_t cbstatus;
/* ME data */
me_data_t *dp=(me_data_t*)adata;
int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
/* coherency on device */
float *cohd;
/* baseline-station map on device/host */
char *bbd;
/* calculate no of cuda threads and blocks */
int ThreadsPerBlock=128;
int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
/* reshape x to make J: 2Nx2 complex double
*/
complex float *x;
if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
/* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
to
J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
*/
float *Jd=(float*)x;
/* re J(0,0) */
my_fcopy(N, &x0[0], 8, &Jd[0], 4);
/* im J(0,0) */
my_fcopy(N, &x0[1], 8, &Jd[1], 4);
/* re J(1,0) */
my_fcopy(N, &x0[4], 8, &Jd[2], 4);
/* im J(1,0) */
my_fcopy(N, &x0[5], 8, &Jd[3], 4);
/* re J(0,1) */
my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
/* im J(0,1) */
my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
/* re J(1,1) */
my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
/* im J(1,1) */
my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
int ci;
/***************************************************/
cuFloatComplex *xd,*fgradxd,*etad,*zd,*x_propd,*z_propd;
float *yd;
float *wtd,*qd; /* for robust weight and log(weight) */
float robust_nu=(float)dp->robust_nu;
float q_sum,robust_nu1;
float deltanu;
int Nd=100; /* no of points where nu is sampled, note Nd<N */
if (Nd>M) { Nd=M; }
deltanu=(float)(robust_nuhigh-robust_nulow)/(float)Nd;
/* for counting how many baselines contribute to each station
grad/hess calculation */
float *iwd,*iw;
if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
unsigned long int moff=0;
fgradxd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N; /* 4N complex means 8N float */
etad=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
zd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
x_propd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
xd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
z_propd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
yd=&gWORK[moff];
moff+=8*M;
cohd=&gWORK[moff];
moff+=Nbase*8;
bbd=(char*)&gWORK[moff];
unsigned long int charstor=(Nbase*2*sizeof(char))/sizeof(float);
if (!charstor || charstor%4) {
moff+=(charstor/4+1)*4; /* NOTE +4 multiple to align memory */
} else {
moff+=charstor;
}
iwd=&gWORK[moff];
if (!(N%4)) {
moff+=N;
} else {
moff+=(N/4+1)*4;
}
wtd=&gWORK[moff];
if (!(M%4)) {
moff+=M;
} else {
moff+=(M/4+1)*4;
}
qd=&gWORK[moff];
if (!(M%4)) {
moff+=M;
} else {
moff+=(M/4+1)*4;
}
/* remaining memory */
float *gWORK1=&gWORK[moff];
/* yd <=y : V */
err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* need to give right offset for coherencies */
/* offset: cluster offset+time offset */
/* C */
err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* correct offset for baselines */
err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* xd <=x : solution */
err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
float fx,fx0;
/* count how many baselines contribute to each station, store (inverse) in iwd */
count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
free(iw);
/* set initial weights to 1 by a cuda kernel */
cudakernel_setweights_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, wtd, 1.0f);
fx=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,gWORK1);
fx0=fx;
#ifdef DEBUG
printf("Initial Cost=%g\n",fx0);
#endif
/***************************************************/
// gradient at x0;
cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
// Hessian
cudakernel_fns_fhess_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,xd,zd,yd,cohd,bbd,iwd,wtd,cbhandle,gWORK1);
// initial step = 1/||Hess||
float hess_nrm;
cublasScnrm2(cbhandle,4*N,zd,1,&hess_nrm);
float t=1.0f/hess_nrm;
/* if initial step too small */
if (t<1e-6f) {
t=1e-6f;
}
/* z <= x */
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
float theta=1.0f;
float ALPHA = 1.01f; // step-size growth factor
float BETA = 0.5f; // step-size shrinkage factor
int k;
cuFloatComplex alpha;
for (k=0; k<itmax; k++) {
/* x_prop <= x */
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,x_propd,1);
/* z_prop <= z */
cbstatus=cublasCcopy(cbhandle,4*N,zd,1,z_propd,1);
/* x <= z - t * grad */
cbstatus=cublasCcopy(cbhandle,4*N,zd,1,xd,1);
alpha.x=-t;alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, xd, 1);
/* if ||x-z|| == t||grad|| is below threshold, stop iteration */
float grad_nrm,x_nrm;
cublasScnrm2(cbhandle,4*N,fgradxd,1,&grad_nrm);
cublasScnrm2(cbhandle,4*N,xd,1,&x_nrm);
/* norm(y-x)/max(1,norm(x)); */
if (grad_nrm*t/MAX(1.0f,x_nrm) < 1e-6f) {
break;
}
/* theta = 2/(1 + sqrt(1+4/(theta^2))); */
theta=2.0f/(1.0f + sqrtf(1.0f+4.0f/(theta*theta)));
/* z = x + (1-theta)*(x-x_prop);
z = (2-theta)*x - (1-theta) * x_prop */
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
alpha.x=(2.0f-theta);alpha.y=0.0f;
cbstatus=cublasCscal(cbhandle,4*N,&alpha,zd,1);
alpha.x=-(1.0f-theta);alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, x_propd, 1, zd, 1);
/* eta = grad_old;
grad <= grad_f( z ) */
cbstatus=cublasCcopy(cbhandle,4*N,fgradxd,1,etad,1);
cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,zd,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
/* z_prop <= z_prop - z */
alpha.x=-1.0f;alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, zd, 1, z_propd, 1);
/* eta <= eta - new_grad */
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, etad, 1);
/* ||z-z_prop|| */
float ydiffnrm;
cublasScnrm2(cbhandle,4*N,z_propd,1,&ydiffnrm);
/* (z_zold)'*(grad-grad_old) */
float dot_ydiff_gdiff;
cbstatus=cublasSdot(cbhandle, 8*N, (float*)z_propd, 1, (float*)etad, 1, &dot_ydiff_gdiff);
#ifdef DEBUG
printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
#endif
/* the above can be NAN, if so break loop */
if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) {
break;
}
/* backtracking
t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
t = min( ALPHA*t, max( BETA*t, t_hat ));
*/
float t_hat=0.5f*(ydiffnrm*ydiffnrm)/fabsf(dot_ydiff_gdiff);
t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
#ifdef DEBUG
printf("k=%d theta=%e step=%e\n",k,theta,t);
#endif
}
/* final residual */
fx=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,gWORK1);
info[1]=fx;
#ifdef DEBUG
printf("NEW NSD cost=%g\n",fx);
#endif
/***************************************************/
cudaDeviceSynchronize();
/* w <= (p+nu)/(1+error^2), q<=w-log(w) */
/* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
/* sumq<=sum(w-log(w))/N */
cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
q_sum/=(float)M;
#ifdef DEBUG
printf("deltanu=%f sum(w-log(w))=%f\n",deltanu,q_sum);
#endif
/* for nu range 2~numax evaluate, p-variate T
psi((nu0+p)/2)-ln((nu0+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1
note: AECM not ECME
and find min(| |) */
int ThreadsPerBlock2=ThreadsPerBlock/4;
cudakernel_evaluatenu_fl_eight(ThreadsPerBlock2, (Nd+ThreadsPerBlock-1)/ThreadsPerBlock2, Nd, q_sum, qd, deltanu,(float)robust_nulow,robust_nu);
/* find min(abs()) value */
cbstatus=cublasIsamin(cbhandle, Nd, qd, 1, &ci); /* 1 based index */
robust_nu1=(float)robust_nulow+(float)(ci-1)*deltanu;
#ifdef DEBUG
printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
#endif
/* seems pedantic, but make sure new value for robust_nu fits within bounds */
if (robust_nu1<robust_nulow) {
dp->robust_nu=robust_nulow;
} else if (robust_nu1>robust_nuhigh) {
dp->robust_nu=robust_nuhigh;
} else {
dp->robust_nu=(double)robust_nu1;
}
if(fx0>fx) {
//printf("Cost final %g initial %g\n",fx,fx0);

View File

@ -64,6 +64,7 @@ checkCublasError(cublasStatus_t cbstatus, char *file, int line)
/* cost function */
/* storage <= (2 Blocks+4) + 8N */
static float
cudakernel_fns_f_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, float *y, float *coh, char *bbh, float *wtd, cublasHandle_t cbhandle, float *gWORK){
cuFloatComplex *Yd;
@ -113,7 +114,7 @@ cudakernel_fns_proj_admm(int N, cuFloatComplex *x, cuFloatComplex *z, cuFloatCom
/* gradient, also projected to tangent space */
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
/* need 8N*M/ThreadsPerBlock+ 8N float storage */
static void
cudakernel_fns_fgrad_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta, float *y, float *coh, char *bbh, float *iw, float *wtd, int negate, cublasHandle_t cbhandle,float *gWORK) {
@ -182,7 +183,7 @@ cudakernel_fns_fgrad_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N,
}
/* Hessian, also projected to tangent space */
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
/* need 8N*M/ThreadsPerBlock+ 8N float storage */
static void
cudakernel_fns_fhess_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
cuFloatComplex *tempeta;
@ -276,6 +277,161 @@ armijostep(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex
}
/* Fine tune initial trust region radius, also update initial value for x
A. Sartenaer, 1995
returns : trust region estimate,
also modifies x
eta,Heta: used as storage
*/
/* need 8N*2 + MAX(8N+2 Blocks + 4, 8N (1 + ceil(M/Threads))) float storage */
static float
itrr(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta, cuFloatComplex *Heta, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
cuFloatComplex alpha;
cublasStatus_t cbstatus;
/* temp storage, re-using global storage */
cuFloatComplex *s, *x_prop;
unsigned long int moff=0;
s=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
x_prop=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
float *gWORK1=&gWORK[moff];
float f0,fk,mk,rho,rho1,Delta0;
/* initialize trust region radii */
float delta_0=1.0f;
float delta_m=0.0f;
float sigma=0.0f;
float delta=0.0f;
// initial cost
f0=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,y,coh,bbh,wtd,cbhandle,gWORK1);
// gradient at x0;
cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,eta,y,coh,bbh,iw,wtd,1,cbhandle, gWORK1);
// normalize
float eta_nrm;
cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
alpha.x=1.0f/eta_nrm;alpha.y=0.0f;
cbstatus=cublasCscal(cbhandle,4*N,&alpha,eta,1);
cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
alpha.x=delta_0;alpha.y=0.0f;
cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
/* Hessian at s */
cudakernel_fns_fhess_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,s,Heta,y,coh,bbh,iw,wtd,cbhandle,gWORK1);
/* constants used */
float gamma_1=0.0625f; float gamma_2=5.0f; float gamma_3=0.5f; float gamma_4=2.0f;
float mu_0=0.5f; float mu_1=0.5f; float mu_2=0.35f;
float teta=0.25f;
int MK=4;
int m;
for (m=0; m<MK; m++) {
/* x_prop=x0-s */
cbstatus=cublasCcopy(cbhandle,4*N,x,1,x_prop,1);
alpha.x=-1.0f;alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, s, 1, x_prop, 1);
/* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
mk=f0-cudakernel_fns_g(N,x_prop,eta,s,cbhandle)-0.5f*cudakernel_fns_g(N,x_prop,Heta,s,cbhandle);
fk=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,Y,Z,admm_rho,y,coh,bbh,wtd,cbhandle,gWORK1);
if (f0==mk) {
rho=1e9f;
} else {
rho=(f0-fk)/(f0-mk);
}
rho1=fabsf(rho-1.0f);
/* update max radius */
if (rho1<mu_0) {
delta_m=MAX(delta_m,delta_0);
}
if ((f0-fk)>delta) {
delta=f0-fk;
sigma=delta_0;
}
/* radius update */
float beta_1,beta_2,beta_i;
beta_1=0.0f;
beta_2=0.0f;
if (m<MK) {
float g0_s=cudakernel_fns_g(N,x,eta,s,cbhandle);
float b1=(teta*(f0-g0_s)+(1.0f-teta)*mk-fk);
beta_1=(b1==0.0f?1e9f:-teta*g0_s/b1);
float b2=(-teta*(f0-g0_s)+(1.0f+teta)*mk-fk);
beta_2=(b2==0.0f?1e9f:teta*g0_s/b2);
float minbeta=MIN(beta_1,beta_2);
float maxbeta=MAX(beta_1,beta_2);
if (rho1>mu_1) {
if (minbeta>1.0f) {
beta_i=gamma_3;
} else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0f)) {
beta_i=gamma_1;
} else if ((beta_1>=gamma_1 && beta_1<1.0f) && (beta_2<gamma_1 || beta_2>=1.0f)) {
beta_i=beta_1;
} else if ((beta_2>=gamma_1 && beta_2<1.0f) && (beta_1<gamma_1 || beta_1>=1.0f)) {
beta_i=beta_2;
} else {
beta_i=maxbeta;
}
} else if (rho1<=mu_2) {
if (maxbeta<1.0f) {
beta_i=gamma_4;
} else if (maxbeta>gamma_2) {
beta_i=gamma_2;
} else if ((beta_1>=1.0f && beta_1<=gamma_2) && beta_2<1.0f) {
beta_i=beta_1;
} else if ((beta_2>=1.0f && beta_2<=gamma_2) && beta_1<1.0f) {
beta_i=beta_2;
} else {
beta_i=maxbeta;
}
} else {
if (maxbeta<gamma_3) {
beta_i=gamma_3;
} else if (maxbeta>gamma_4) {
beta_i=gamma_4;
} else {
beta_i=maxbeta;
}
}
/* update radius */
delta_0=delta_0/beta_i;
}
#ifdef DEBUG
printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
#endif
cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
alpha.x=delta_0;alpha.y=0.0f;
cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
}
// update initial value
if (delta>0.0f) {
alpha.x=-sigma; alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, eta, 1, x, 1);
}
if (delta_m>0.0f) {
Delta0=delta_m;
} else {
Delta0=delta_0;
}
return Delta0;
}
/* truncated conjugate gradient method
x, grad, eta, r, z, delta, Hxd : size 2N x 2 complex
so, vector size is 4N complex double
@ -399,6 +555,11 @@ tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComp
}
/* storage:
8N * 5 + N + 8M * 2 + 2M + M (base storage)
MAX( 8N+ 2 * Blocks + 4, 8N(6 + ceil(M/Threads))) for functions
Blocks = ceil(M/Threads)
*/
int
rtr_solve_cuda_robust_admm_fl(
float *x0, /* initial values and updated solution at output (size 8*N float) */
@ -407,8 +568,7 @@ rtr_solve_cuda_robust_admm_fl(
float *y, /* data vector (size 8*M float) */
int N, /* no of stations */
int M, /* no of constraints */
int itmax_sd, /* maximum number of iterations RSD */
int itmax_rtr, /* maximum number of iterations RTR */
int itmax_rtr, /* maximum number of iterations */
float Delta_bar, float Delta0, /* Trust region radius and initial value */
float admm_rho, /* ADMM regularization */
double robust_nulow, double robust_nuhigh, /* robust nu range */
@ -572,12 +732,6 @@ rtr_solve_cuda_robust_admm_fl(
cudaMalloc((void **)&Yd, 4*N*sizeof(cuFloatComplex));
cudaMalloc((void **)&Zd, 4*N*sizeof(cuFloatComplex));
/* need 8N*(BlocksPerGrid+8) for tcg_solve+grad/hess storage,
so total storage needed is
8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M + M
plus 8N + 8N for ADMM params (Y and BZ)
*/
/* remaining memory */
float *gWORK1=&gWORK[moff];
@ -616,28 +770,39 @@ rtr_solve_cuda_robust_admm_fl(
printf("Initial Cost=%g\n",fx0);
#endif
/***************************************************/
int rsdstat=0;
/* RSD solution */
for (ci=0; ci<itmax_sd; ci++) {
/* Armijo step */
rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
/* x=R(x,teta); */
cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
if (!rsdstat) {
/* cost reduced, update solution */
cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
} else {
/* no cost reduction, break loop */
break;
}
}
// int rsdstat=0;
// /* RSD solution - disabled */
// for (ci=0; ci<itmax_sd; ci++) {
// /* Armijo step */
// rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
// /* x=R(x,teta); */
// cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
// if (!rsdstat) {
// /* cost reduced, update solution */
// cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
// } else {
// /* no cost reduction, break loop */
// break;
// }
// }
float Delta_new=itrr(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, Hetad, yd, cohd, bbd, iwd, wtd, cbhandle, gWORK1);
#ifdef DEBUG
printf("TR radius given=%f est=%f\n",Delta0,Delta_new);
#endif
//old values
//Delta_bar=MIN(fx,Delta_bar);
//Delta0=Delta_bar*0.125f;
Delta0=MIN(Delta_new,0.01f); /* need to be more restrictive for EM */
Delta_bar=Delta0*8.0f;
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
cudakernel_fns_fupdate_weights(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,robust_nu);
Delta_bar=MIN(fx,Delta_bar);
Delta0=Delta_bar*0.125f;
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
#ifdef DEBUG
printf("NEW RSD cost=%g\n",fx);
#endif
@ -787,7 +952,8 @@ printf("NEW RTR cost=%g\n",fx);
/***************************************************/
cudaDeviceSynchronize();
/* w <= (8+nu)/(1+error^2), q<=w-log(w) */
/* w <= (p+nu)/(1+error^2), q<=w-log(w) */
/* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
/* sumq<=sum(w-log(w))/N */
cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
@ -807,7 +973,14 @@ printf("NEW RTR cost=%g\n",fx);
#ifdef DEBUG
printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
#endif
dp->robust_nu=(double)robust_nu1;
/* seems pedantic, but make sure new value for robust_nu fits within bounds */
if (robust_nu1<robust_nulow) {
dp->robust_nu=robust_nulow;
} else if (robust_nu1>robust_nuhigh) {
dp->robust_nu=robust_nuhigh;
} else {
dp->robust_nu=(double)robust_nu1;
}
#ifdef DEBUG
printf("Cost final %g initial %g\n",fx,fx0);
@ -846,3 +1019,387 @@ printf("NEW RTR cost=%g\n",fx);
return 0;
}
/* storage:
8N * 6 + N + 8M * 2 + 2M + M (base storage)
MAX( 2 * Blocks + 4, 8N(1 + ceil(M/Threads))) for functions
Blocks = ceil(M/Threads)
*/
int
nsd_solve_cuda_robust_admm_fl(
float *x0, /* initial values and updated solution at output (size 8*N float) */
float *Y, /* Lagrange multiplier size 8N */
float *Z, /* consensus term B Z size 8N */
float *y, /* data vector (size 8*M float) */
int N, /* no of stations */
int M, /* no of constraints */
int itmax, /* maximum number of iterations */
float admm_rho, /* ADMM regularization */
double robust_nulow, double robust_nuhigh, /* robust nu range */
double *info, /* initial and final residuals */
cublasHandle_t cbhandle, /* device handle */
float *gWORK, /* GPU allocated memory */
int tileoff, /* tile offset when solving for many chunks */
int ntiles, /* total tile (data) size being solved for */
me_data_t *adata)
{
/* general note: all device variables end with a 'd' */
cudaError_t err;
cublasStatus_t cbstatus;
/* ME data */
me_data_t *dp=(me_data_t*)adata;
int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
/* coherency on device */
float *cohd;
/* baseline-station map on device/host */
char *bbd;
/* calculate no of cuda threads and blocks */
int ThreadsPerBlock=128;
int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
/* reshape x to make J: 2Nx2 complex double
*/
complex float *x;
if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
/* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
to
J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
*/
float *Jd=(float*)x;
/* re J(0,0) */
my_fcopy(N, &x0[0], 8, &Jd[0], 4);
/* im J(0,0) */
my_fcopy(N, &x0[1], 8, &Jd[1], 4);
/* re J(1,0) */
my_fcopy(N, &x0[4], 8, &Jd[2], 4);
/* im J(1,0) */
my_fcopy(N, &x0[5], 8, &Jd[3], 4);
/* re J(0,1) */
my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
/* im J(0,1) */
my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
/* re J(1,1) */
my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
/* im J(1,1) */
my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
complex float *Zx,*Yx;
if ((Zx=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
if ((Yx=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
float *YY=(float*)Yx;
my_fcopy(N, &Y[0], 8, &YY[0], 4);
my_fcopy(N, &Y[1], 8, &YY[1], 4);
my_fcopy(N, &Y[4], 8, &YY[2], 4);
my_fcopy(N, &Y[5], 8, &YY[3], 4);
my_fcopy(N, &Y[2], 8, &YY[4*N], 4);
my_fcopy(N, &Y[3], 8, &YY[4*N+1], 4);
my_fcopy(N, &Y[6], 8, &YY[4*N+2], 4);
my_fcopy(N, &Y[7], 8, &YY[4*N+3], 4);
float *ZZ=(float*)Zx;
my_fcopy(N, &Z[0], 8, &ZZ[0], 4);
my_fcopy(N, &Z[1], 8, &ZZ[1], 4);
my_fcopy(N, &Z[4], 8, &ZZ[2], 4);
my_fcopy(N, &Z[5], 8, &ZZ[3], 4);
my_fcopy(N, &Z[2], 8, &ZZ[4*N], 4);
my_fcopy(N, &Z[3], 8, &ZZ[4*N+1], 4);
my_fcopy(N, &Z[6], 8, &ZZ[4*N+2], 4);
my_fcopy(N, &Z[7], 8, &ZZ[4*N+3], 4);
int ci;
/***************************************************/
cuFloatComplex *xd,*fgradxd,*etad,*zd,*x_propd,*z_propd,*Yd,*Zd;
float *yd;
float *wtd,*qd; /* for robust weight and log(weight) */
float robust_nu=(float)dp->robust_nu;
float q_sum,robust_nu1;
float deltanu;
int Nd=100; /* no of points where nu is sampled, note Nd<N */
if (Nd>M) { Nd=M; }
deltanu=(float)(robust_nuhigh-robust_nulow)/(float)Nd;
/* for counting how many baselines contribute to each station
grad/hess calculation */
float *iwd,*iw;
if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
unsigned long int moff=0;
fgradxd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N; /* 4N complex means 8N float */
etad=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
zd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
x_propd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
xd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
z_propd=(cuFloatComplex*)&gWORK[moff];
moff+=8*N;
yd=&gWORK[moff];
moff+=8*M;
cohd=&gWORK[moff];
moff+=Nbase*8;
bbd=(char*)&gWORK[moff];
unsigned long int charstor=(Nbase*2*sizeof(char))/sizeof(float);
if (!charstor || charstor%4) {
moff+=(charstor/4+1)*4; /* NOTE +4 multiple to align memory */
} else {
moff+=charstor;
}
iwd=&gWORK[moff];
if (!(N%4)) {
moff+=N;
} else {
moff+=(N/4+1)*4;
}
wtd=&gWORK[moff];
if (!(M%4)) {
moff+=M;
} else {
moff+=(M/4+1)*4;
}
qd=&gWORK[moff];
if (!(M%4)) {
moff+=M;
} else {
moff+=(M/4+1)*4;
}
cudaMalloc((void **)&Yd, 4*N*sizeof(cuFloatComplex));
cudaMalloc((void **)&Zd, 4*N*sizeof(cuFloatComplex));
/* remaining memory */
float *gWORK1=&gWORK[moff];
/* yd <=y : V */
err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* need to give right offset for coherencies */
/* offset: cluster offset+time offset */
/* C */
err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* correct offset for baselines */
err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* xd <=x : solution */
err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(Yd, Yx, 8*N*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(Zd, Zx, 8*N*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
float fx,fx0;
/* count how many baselines contribute to each station, store (inverse) in iwd */
count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
free(iw);
/* set initial weights to 1 by a cuda kernel */
cudakernel_setweights_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, wtd, 1.0f);
fx=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,yd,cohd,bbd,wtd,cbhandle,gWORK1);
fx0=fx;
#ifdef DEBUG
printf("Initial Cost=%g\n",fx0);
#endif
/***************************************************/
// gradient at x0;
cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
// Hessian
cudakernel_fns_fhess_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,xd,zd,yd,cohd,bbd,iwd,wtd,cbhandle,gWORK1);
// initial step = 1/||Hess||
float hess_nrm;
cublasScnrm2(cbhandle,4*N,zd,1,&hess_nrm);
float t=1.0f/hess_nrm;
/* if initial step too small */
if (t<1e-6f) {
t=1e-6f;
}
/* z <= x */
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
float theta=1.0f;
float ALPHA = 1.01f; // step-size growth factor
float BETA = 0.5f; // step-size shrinkage factor
int k;
cuFloatComplex alpha;
for (k=0; k<itmax; k++) {
/* x_prop <= x */
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,x_propd,1);
/* z_prop <= z */
cbstatus=cublasCcopy(cbhandle,4*N,zd,1,z_propd,1);
/* x <= z - t * grad */
cbstatus=cublasCcopy(cbhandle,4*N,zd,1,xd,1);
alpha.x=-t;alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, xd, 1);
/* if ||x-z|| == t||grad|| is below threshold, stop iteration */
float grad_nrm,x_nrm;
cublasScnrm2(cbhandle,4*N,fgradxd,1,&grad_nrm);
cublasScnrm2(cbhandle,4*N,xd,1,&x_nrm);
/* norm(y-x)/max(1,norm(x)); */
if (grad_nrm*t/MAX(1.0f,x_nrm) < 1e-6f) {
break;
}
/* theta = 2/(1 + sqrt(1+4/(theta^2))); */
theta=2.0f/(1.0f + sqrtf(1.0f+4.0f/(theta*theta)));
/* z = x + (1-theta)*(x-x_prop);
z = (2-theta)*x - (1-theta) * x_prop */
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
alpha.x=(2.0f-theta);alpha.y=0.0f;
cbstatus=cublasCscal(cbhandle,4*N,&alpha,zd,1);
alpha.x=-(1.0f-theta);alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, x_propd, 1, zd, 1);
/* eta = grad_old;
grad <= grad_f( z ) */
cbstatus=cublasCcopy(cbhandle,4*N,fgradxd,1,etad,1);
cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,zd,Yd,Zd,admm_rho,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
/* z_prop <= z_prop - z */
alpha.x=-1.0f;alpha.y=0.0f;
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, zd, 1, z_propd, 1);
/* eta <= eta - new_grad */
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, etad, 1);
/* ||z-z_prop|| */
float ydiffnrm;
cublasScnrm2(cbhandle,4*N,z_propd,1,&ydiffnrm);
/* (z_zold)'*(grad-grad_old) */
float dot_ydiff_gdiff;
cbstatus=cublasSdot(cbhandle, 8*N, (float*)z_propd, 1, (float*)etad, 1, &dot_ydiff_gdiff);
#ifdef DEBUG
printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
#endif
/* the above can be NAN, if so break loop */
if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) {
break;
}
/* backtracking
t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
t = min( ALPHA*t, max( BETA*t, t_hat ));
*/
float t_hat=0.5f*(ydiffnrm*ydiffnrm)/fabsf(dot_ydiff_gdiff);
t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
#ifdef DEBUG
printf("k=%d theta=%e step=%e\n",k,theta,t);
#endif
}
/* final residual */
fx=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,yd,cohd,bbd,wtd,cbhandle,gWORK1);
info[1]=fx;
#ifdef DEBUG
printf("NEW NSD cost=%g\n",fx);
#endif
/***************************************************/
cudaDeviceSynchronize();
/* w <= (p+nu)/(1+error^2), q<=w-log(w) */
/* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
/* sumq<=sum(w-log(w))/N */
cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
q_sum/=(float)M;
#ifdef DEBUG
printf("deltanu=%f sum(w-log(w))=%f\n",deltanu,q_sum);
#endif
/* for nu range 2~numax evaluate, p-variate T
psi((nu0+p)/2)-ln((nu0+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1
note: AECM not ECME
and find min(| |) */
int ThreadsPerBlock2=ThreadsPerBlock/4;
cudakernel_evaluatenu_fl_eight(ThreadsPerBlock2, (Nd+ThreadsPerBlock-1)/ThreadsPerBlock2, Nd, q_sum, qd, deltanu,(float)robust_nulow,robust_nu);
/* find min(abs()) value */
cbstatus=cublasIsamin(cbhandle, Nd, qd, 1, &ci); /* 1 based index */
robust_nu1=(float)robust_nulow+(float)(ci-1)*deltanu;
#ifdef DEBUG
printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
#endif
/* seems pedantic, but make sure new value for robust_nu fits within bounds */
if (robust_nu1<robust_nulow) {
dp->robust_nu=robust_nulow;
} else if (robust_nu1>robust_nuhigh) {
dp->robust_nu=robust_nuhigh;
} else {
dp->robust_nu=(double)robust_nu1;
}
if(fx0>fx) {
//printf("Cost final %g initial %g\n",fx,fx0);
/* copy back current solution */
err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
/* copy back solution to x0 : format checked*/
/* re J(0,0) */
my_fcopy(N, &Jd[0], 4, &x0[0], 8);
/* im J(0,0) */
my_fcopy(N, &Jd[1], 4, &x0[1], 8);
/* re J(1,0) */
my_fcopy(N, &Jd[2], 4, &x0[4], 8);
/* im J(1,0) */
my_fcopy(N, &Jd[3], 4, &x0[5], 8);
/* re J(0,1) */
my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
/* im J(0,1) */
my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
/* re J(1,1) */
my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
/* im J(1,1) */
my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
}
cudaFree(Yd);
cudaFree(Zd);
free(x);
free(Yx);
free(Zx);
return 0;
}

View File

@ -71,6 +71,9 @@
#define STYPE_RING 3
#define STYPE_SHAPELET 4
/* max source name length, increase it if names get longer */
#define MAX_SNAME 2048
/********* constants - from levmar ******************/
#define CLM_INIT_MU 1E-03
#define CLM_STOP_THRESH 1E-17
@ -192,6 +195,8 @@ typedef struct thread_data_base_ {
/* following for ignoring clusters in simulation */
int *ignlist; /* Mx1 array, if any value 1, that cluster will not be simulated */
/* flag for adding model to data */
int add_to_data;
/* following used for multifrequency (channel) data */
double *freqs;
@ -264,6 +269,18 @@ typedef struct thread_data_setwt_ {
} thread_data_setwt_t;
/* structure for weight calculation for baselines */
typedef struct thread_data_baselinewt_ {
int Nb; /* no of baselines this handle */
int boff; /* baseline offset per thread */
double *wt; /* 8 values per baseline */
double *u,*v;
double freq0;
} thread_data_baselinewt_t;
/* structure for worker threads for jacobian calculation */
typedef struct thread_data_jac_ {
@ -436,6 +453,21 @@ read_solutions(FILE *sfp,double *p,clus_source_t *carr,int N,int M);
*/
extern int
update_ignorelist(const char *ignfile, int *ignlist, int M, clus_source_t *carr);
/* read ADMM regularization factor per cluster from text file, format:
cluster_id hybrid_parameter admm_rho
...
...
(M values)
and store it in array arho : size Mtx1, taking into account the hybrid parameter
also in array arhoslave : size Mx1, without taking hybrid params into account
admm_rho : can be 0 to ignore consensus, just normal calibration
*/
extern int
read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double *arhoslave);
/****************************** dataio.c ****************************/
/* open binary file for input/output
datfile: data file descriptor id
@ -874,7 +906,9 @@ __attribute__ ((target(MIC)))
extern int
lbfgs_fit_robust(
void (*func)(double *p, double *hx, int m, int n, void *adata),
double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, void *adata);
double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads,
int whiten, /* if >0 whiten data 1: NCP, 2... */
void *adata);
#ifdef HAVE_CUDA
extern int
lbfgs_fit_robust_cuda(
@ -925,15 +959,17 @@ calculate_residuals_multifreq(double *u,double *v,double *w,double *p,double *x,
/*
calculate visibilities for multiple channels, no solutions are used
note: output column x is set to 0
note: output column x is set to 0 if add_to_data ==0, else model is added to data
*/
extern int
predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt);
predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt,int add_to_data);
/* predict with solutions in p , ignore clusters flagged in ignorelist (Mx1) array*/
/* predict with solutions in p , ignore clusters flagged in ignorelist (Mx1) array
also correct final data with solutions for cluster ccid, if valid
*/
extern int
predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignorelist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt);
predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignorelist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt,int add_to_data, int ccid, double rho);
/****************************** mderiv.cu ****************************/
/* cuda driver for kernel */
/* ThreadsPerBlock: keep <= 128
@ -1331,6 +1367,7 @@ osrlevmar_der_single_cuda_fl(
int ntiles, /* total tile (data) size being solved for */
double robust_nulow, double robust_nuhigh, /* robust nu range */
int randomize, /* if >0 randomize */
int whiten, /* if >0 whiten data 1: NCP, 2... */
void *adata);
#endif /* HAVE_CUDA */
@ -1357,6 +1394,7 @@ rlevmar_der_single_nocuda(
int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
int Nt, /* no of threads */
double robust_nulow, double robust_nuhigh, /* robust nu range */
int whiten, /* if >0 whiten data 1: NCP, 2... */
void *adata);
/* robust LM, OS acceleration */
@ -1396,6 +1434,7 @@ osrlevmar_der_single_nocuda(
int Nt, /* no of threads */
double robust_nulow, double robust_nuhigh, /* robust nu range */
int randomize, /* if >0 randomize */
int whiten, /* if >0 whiten data 1: NCP, 2... */
void *adata);
/****************************** updatenu.c ****************************/
@ -1425,6 +1464,15 @@ __attribute__ ((target(MIC)))
#endif
extern double
update_w_and_nu(double nu0, double *w, double *ed, int N, int Nt, double nulow, double nuhigh);
/* update weights array wt by multiplying it with the inverse density function
1/( 1+f(u,v) )
as u,v->inf, f(u,v) -> 0 so long baselines are not affected
wt : Nbase*8 x 1
u,v : Nbase x 1
note: u = u/c, v=v/c here, so need freq to convert to wavelengths */
extern void
add_whitening_weights(int Nbase, double *wt, double *u, double *v, double freq0, int Nt);
/****************************** clmfit_nocuda.c ****************************/
/* LM with LAPACK */
/** keep interface almost the same as in levmar **/
@ -1734,6 +1782,22 @@ rtr_solve_nocuda_robust(
double *info, /* initial and final residuals */
me_data_t *adata);
/* Nesterov's SD */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
extern int
nsd_solve_nocuda_robust(
double *x, /* initial values and updated solution at output (size 8*N double) */
double *y, /* data vector (size 8*M double) */
int N, /* no. of stations */
int M, /* no. of constraints */
int itmax, /* maximum number of iterations */
double robust_nulow, double robust_nuhigh, /* robust nu range */
double *info, /* initial and final residuals */
me_data_t *adata); /* pointer to additional data
*/
/****************************** rtr_solve_robust_admm.c ****************************/
#ifdef USE_MIC
__attribute__ ((target(MIC)))
@ -1824,6 +1888,24 @@ rtr_solve_cuda_robust_fl(
int tileoff, /* tile offset when solving for many chunks */
int ntiles, /* total tile (data) size being solved for */
me_data_t *adata);
/* Nesterov's steepest descent */
extern int
nsd_solve_cuda_robust_fl(
float *x0, /* initial values and updated solution at output (size 8*N float) */
float *y, /* data vector (size 8*M float) */
int N, /* no of stations */
int M, /* no of constraints */
int itmax, /* maximum number of iterations */
double robust_nulow, double robust_nuhigh, /* robust nu range */
double *info, /* initial and final residuals */
cublasHandle_t cbhandle, /* device handle */
float *gWORK, /* GPU allocated memory */
int tileoff, /* tile offset when solving for many chunks */
int ntiles, /* total tile (data) size being solved for */
me_data_t *adata);
/****************************** rtr_solve_robust_cuda_admm.c ****************************/
/* ADMM solver */
extern int
@ -1834,8 +1916,7 @@ rtr_solve_cuda_robust_admm_fl(
float *y, /* data vector (size 8*M float) */
int N, /* no of stations */
int M, /* no of constraints */
int itmax_sd, /* maximum number of iterations RSD */
int itmax_rtr, /* maximum number of iterations RTR */
int itmax_rtr, /* maximum number of iterations */
float Delta_bar, float Delta0, /* Trust region radius and initial value */
float admm_rho, /* ADMM regularization */
double robust_nulow, double robust_nuhigh, /* robust nu range */
@ -1846,6 +1927,26 @@ rtr_solve_cuda_robust_admm_fl(
int tileoff, /* tile offset when solving for many chunks */
int ntiles, /* total tile (data) size being solved for */
me_data_t *adata);
/* Nesterov's SD */
extern int
nsd_solve_cuda_robust_admm_fl(
float *x0, /* initial values and updated solution at output (size 8*N float) */
float *Y, /* Lagrange multiplier size 8N */
float *Z, /* consensus term B Z size 8N */
float *y, /* data vector (size 8*M float) */
int N, /* no of stations */
int M, /* no of constraints */
int itmax, /* maximum number of iterations */
float admm_rho, /* ADMM regularization */
double robust_nulow, double robust_nuhigh, /* robust nu range */
double *info, /* initial and final residuals */
cublasHandle_t cbhandle, /* device handle */
float *gWORK, /* GPU allocated memory */
int tileoff, /* tile offset when solving for many chunks */
int ntiles, /* total tile (data) size being solved for */
me_data_t *adata);
#endif /* HAVE_CUDA */
/****************************** lmfit.c ****************************/
/****************************** lmfit_nocuda.c ****************************/
@ -1885,6 +1986,7 @@ random_permutation(int n, int weighted_iter, double *w);
#define SM_RLM_RLBFGS 2
#define SM_RTR_OSLM_LBFGS 4
#define SM_RTR_OSRLM_RLBFGS 5
#define SM_NSD_RLBFGS 6
/* fit visibilities
u,v,w: u,v,w coordinates (wavelengths) size Nbase*tilesz x 1
u,v,w are ordered with baselines, timeslots
@ -1913,6 +2015,7 @@ random_permutation(int n, int weighted_iter, double *w);
nulow,nuhigh: robust nu search range
randomize: if >0, randomize cluster selection in SAGE and OS subset selection
whiten : if >0 whiten data 1: NCP, 2...
mean_nu: output mean value of nu
res_0,res_1: initial and final residuals (output)
return val=0 if final residual< initial residual
@ -1923,7 +2026,7 @@ __attribute__ ((target(MIC)))
#endif
extern int
sagefit_visibilities(double *u, double *v, double *w, double *x, int N,
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt,int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv, int solver_mode, double nulow, double nuhigh, int randomize, double *mean_nu, double *res_0, double *res_1);
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt,int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv, int solver_mode, double nulow, double nuhigh, int randomize, int whiten, double *mean_nu, double *res_0, double *res_1);
/* same as above, but uses 2 GPUS in the LM stage */
extern int
@ -2010,6 +2113,9 @@ typedef struct pipeline_ {
#ifndef PT_DO_WORK_RRTR /* Robust RTR */
#define PT_DO_WORK_RRTR 8
#endif
#ifndef PT_DO_WORK_NSD /* Nesterov's SD */
#define PT_DO_WORK_NSD 9
#endif
#ifndef PT_DO_MEMRESET
#define PT_DO_MEMRESET 99
#endif
@ -2086,7 +2192,7 @@ typedef struct gb_data_admm_fl_ {
float *p[2]; /* pointer to parameters being solved by each thread */
float *Y[2]; /* pointer to Lagrange multiplier */
float *Z[2]; /* pointer to consensus term */
float admm_rho;
float admm_rho[2];
float *x[2]; /* pointer to data being fit by each thread */
int M[2];
int N[2];
@ -2233,23 +2339,23 @@ update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi);
Y : Lagrange multiplier
BZ : consensus term
Y,BZ : size same as pp : 8*N*Mt x1 double values (re,img) for each station/direction
admm_rho : regularization factor
admm_rho : regularization factor array size Mx1
*/
extern int
sagefit_visibilities_admm(double *u, double *v, double *w, double *x, int N,
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1);
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1);
/* ADMM cost function = normal_cost + ||Y^H(J-BZ)|| + rho/2 ||J-BZ||^2 */
/* extra params
Y : Lagrange multiplier
BZ : consensus term
Y,BZ : size same as pp : 8*N*Mt x1 double values (re,img) for each station/direction
admm_rho : regularization factor
admm_rho : regularization factor array size Mx1
*/
#ifdef HAVE_CUDA
extern int
sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x, int N,
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1);
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1);
#endif
#ifdef __cplusplus

View File

@ -336,3 +336,151 @@ update_nu(double logsumw, int Nd, int Nt, double nulow, double nuhigh, int p, do
free(q);
return thisnu;
}
/* x = sqrt(u^2+v^2) */
static double
ncp_weight(double ud) {
/* fo(x) =
a1*exp(-((x-b1)/c1)^2) + a2*exp(-((x-b2)/c2)^2) +
a3*exp(-((x-b3)/c3)^2) + a4*exp(-((x-b4)/c4)^2) +
a5*exp(-((x-b5)/c5)^2) + a6*exp(-((x-b6)/c6)^2)
mean(fo(x)) is about 1
*/
float x=(float)ud;
if (x<40.0f) { return 1.0; }
if (x>800.0f) {
return 1.0;
}
/* else [40,285] */
float r[6];
float a1 =-0.9415f;
float b1 =117.1f;
float c1 =15.08f;
float a2 =5.231f;
float b2 =49.57f;
float c2 =13.79f;
float a3 =2.209f;
float b3 =67.29f;
float c3 =14.86f;
float a4 =10.43f;
float b4 =72.19f;
float c4 =200.8f;
float a5 =104.9f;
float b5 =98.72f;
float c5 =65.8f;
float a6 =-101.3f;
float b6 =101.2f;
float c6 =66.63f;
r[0]=(x-b1)/c1;
r[1]=(x-b2)/c2;
r[2]=(x-b3)/c3;
r[3]=(x-b4)/c4;
r[4]=(x-b5)/c5;
r[5]=(x-b6)/c6;
r[0]*=-r[0];
r[1]*=-r[1];
r[2]*=-r[2];
r[3]*=-r[3];
r[4]*=-r[4];
r[5]*=-r[5];
float sum=0.0f;
sum+=a1*expf(r[0]);
sum+=a2*expf(r[1]);
sum+=a3*expf(r[2]);
sum+=a4*expf(r[3]);
sum+=a5*expf(r[4]);
sum+=a6*expf(r[5]);
return (1.0/((double)sum+1.0)); /* as x-> inf, goes to 1 */
}
static void *
threadfn_setblweight(void *data) {
thread_data_baselinewt_t *t=(thread_data_baselinewt_t*)data;
int ci;
for (ci=0; ci<t->Nb; ci++) {
/* get sqrt(u^2+v^2) */
double uu=t->u[ci+t->boff]*t->freq0;
double vv=t->v[ci+t->boff]*t->freq0;
double a=ncp_weight(sqrt(uu*uu+vv*vv));
t->wt[8*(ci+t->boff)]*=a;
t->wt[8*(ci+t->boff)+1]*=a;
t->wt[8*(ci+t->boff)+2]*=a;
t->wt[8*(ci+t->boff)+3]*=a;
t->wt[8*(ci+t->boff)+4]*=a;
t->wt[8*(ci+t->boff)+5]*=a;
t->wt[8*(ci+t->boff)+6]*=a;
t->wt[8*(ci+t->boff)+7]*=a;
printf("%lf %lf %lf\n",uu,vv,a);
}
return NULL;
}
/* update weights array wt by multiplying it with the inverse density function
1/( 1+f(u,v) )
as u,v->inf, f(u,v) -> 0 so long baselines are not affected
wt : Nbase*8 x 1
u,v : Nbase x 1
note: u = u/c, v=v/c here, so need freq to convert to wavelengths */
void
add_whitening_weights(int Nbase, double *wt, double *u, double *v, double freq0, int Nt) {
pthread_attr_t attr;
pthread_t *th_array;
thread_data_baselinewt_t *threaddata;
int ci,nth1,nth;
int Nthb0,Nthb;
Nthb0=(Nbase+Nt-1)/Nt;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
if ((threaddata=(thread_data_baselinewt_t*)malloc((size_t)Nt*sizeof(thread_data_baselinewt_t)))==0) {
#ifndef USE_MIC
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
exit(1);
}
/* iterate over threads, allocating baselines per thread */
ci=0;
for (nth=0; nth<Nt && ci<Nbase; nth++) {
if (ci+Nthb0<Nbase) {
Nthb=Nthb0;
} else {
Nthb=Nbase-ci;
}
threaddata[nth].Nb=Nthb;
threaddata[nth].boff=ci;
threaddata[nth].wt=wt;
threaddata[nth].u=u;
threaddata[nth].v=v;
threaddata[nth].freq0=freq0;
pthread_create(&th_array[nth],&attr,threadfn_setblweight,(void*)(&threaddata[nth]));
/* next baseline set */
ci=ci+Nthb;
}
/* now wait for threads to finish */
for(nth1=0; nth1<nth; nth1++) {
pthread_join(th_array[nth1],NULL);
}
pthread_attr_destroy(&attr);
free(th_array);
free(threaddata);
}