update to 0.3.8
This commit is contained in:
parent
b167c81854
commit
4f971a5cdd
|
@ -13,6 +13,7 @@ LDFLAGS=-Wl,--rpath,/usr/local/OpenBLAS/lib/
|
|||
#LDFLAGS=-Wl,-t,--rpath,/software/users/lofareor/SW/lib64
|
||||
# -Wl,--hash-style=both
|
||||
|
||||
# with multithread FFTW
|
||||
MY_LIBS=-lm -lsagecal
|
||||
INCLUDES=-I. -I./lib -I$(CASA_INCDIR) -I/usr/include
|
||||
LIBPATH=-L$(LAPACK_DIR) -L$(CASA_LIBDIR) -L./lib
|
||||
|
|
|
@ -33,7 +33,7 @@ using namespace Data;
|
|||
|
||||
void
|
||||
print_copyright(void) {
|
||||
cout<<"SAGECal-MPI 0.3.5 (C) 2011-2015 Sarod Yatawatta"<<endl;
|
||||
cout<<"SAGECal-MPI 0.3.9 (C) 2011-2015 Sarod Yatawatta"<<endl;
|
||||
}
|
||||
|
||||
|
||||
|
@ -44,10 +44,10 @@ print_help(void) {
|
|||
cout << "-f MSlist: text file with MS names" << endl;
|
||||
cout << "-s sky.txt: sky model file"<< endl;
|
||||
cout << "-c cluster.txt: cluster file"<< endl;
|
||||
cout << "-p solutions.txt: if given, save solution in this file, if not given 'XXX.MS.solutions' will be used"<< endl;
|
||||
cout << "-p solutions.txt: if given, save (global) solutions in this file, but slaves will always write to 'XXX.MS.solutions'"<< endl;
|
||||
cout << "-F sky model format: 0: LSM, 1: LSM with 3 order spectra : default "<< Data::format<<endl;
|
||||
cout << "-I input column (DATA/CORRECTED_DATA) : default " <<Data::DataField<< endl;
|
||||
cout << "-O ouput column (DATA/CORRECTED_DATA) : default " <<Data::OutField<< endl;
|
||||
cout << "-I input column (DATA/CORRECTED_DATA/...) : default " <<Data::DataField<< endl;
|
||||
cout << "-O ouput column (DATA/CORRECTED_DATA/...) : default " <<Data::OutField<< endl;
|
||||
cout << "-e max EM iterations : default " <<Data::max_emiter<< endl;
|
||||
cout << "-g max iterations (within single EM) : default " <<Data::max_iter<< endl;
|
||||
cout << "-l max LBFGS iterations : default " <<Data::max_lbfgs<< endl;
|
||||
|
@ -57,16 +57,16 @@ print_help(void) {
|
|||
cout << "-A ADMM iterations: default " <<Data::Nadmm<< endl;
|
||||
cout << "-P consensus polynomial order: default " <<Data::Npoly<< endl;
|
||||
cout << "-r regularization factor: default " <<Data::admm_rho<< endl;
|
||||
cout << "-G regularization factor of each cluster (text file instead of -r): default : None" << endl;
|
||||
cout << "-x exclude baselines length (lambda) lower than this in calibration : default "<<Data::min_uvcut << endl;
|
||||
cout << "-y exclude baselines length (lambda) higher than this in calibration : default "<<Data::max_uvcut << endl;
|
||||
cout <<endl<<"Advanced options:"<<endl;
|
||||
cout << "-k cluster_id : correct residuals with solution of this cluster : default "<<Data::ccid<< endl;
|
||||
cout << "-o robust rho, robust matrix inversion during correction: default "<<Data::rho<< endl;
|
||||
cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: default "<<Data::solver_mode<< endl;
|
||||
cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: 6: NSD, default "<<Data::solver_mode<< endl;
|
||||
cout << "-L robust nu, lower bound: default "<<Data::nulow<< endl;
|
||||
cout << "-H robust nu, upper bound: default "<<Data::nuhigh<< endl;
|
||||
cout << "-R randomize iterations: default "<<Data::randomize<< endl;
|
||||
cout << "-D 0,1,2 : if >0, enable diagnostics (Jacobian Leverage) 1 replace Jacobian Leverage as output, 2 only fractional noise/leverage is printed: default " <<Data::DoDiag<< endl;
|
||||
cout << "-T stop after this number of solutions (0 means no limit): default "<<Data::Nmaxtime<< endl;
|
||||
cout <<"Report bugs to <sarod@users.sf.net>"<<endl;
|
||||
}
|
||||
|
@ -75,7 +75,7 @@ print_help(void) {
|
|||
void
|
||||
ParseCmdLine(int ac, char **av) {
|
||||
char c;
|
||||
while((c=getopt(ac, av, "c:e:f:g:j:k:l:m:n:o:p:r:s:t:x:y:A:D:F:I:L:O:P:H:R:T:h"))!= -1)
|
||||
while((c=getopt(ac, av, "c:e:f:g:j:k:l:m:n:o:p:r:s:t:x:y:A:F:I:L:O:P:G:H:R:T:h"))!= -1)
|
||||
{
|
||||
switch(c)
|
||||
{
|
||||
|
@ -147,6 +147,9 @@ ParseCmdLine(int ac, char **av) {
|
|||
case 'r':
|
||||
admm_rho= atof(optarg);
|
||||
break;
|
||||
case 'G':
|
||||
admm_rho_file= optarg;
|
||||
break;
|
||||
case 'R':
|
||||
randomize= atoi(optarg);
|
||||
break;
|
||||
|
|
|
@ -67,40 +67,42 @@ sagecal_master(int argc, char **argv) {
|
|||
|
||||
iodata.Nms=ntasks-1;
|
||||
/**** get info from slaves ***************************************/
|
||||
int *bufint=new int[4];
|
||||
int *bufint=new int[5];
|
||||
double *bufdouble=new double[1];
|
||||
iodata.freqs=new double[iodata.Nms];
|
||||
iodata.freq0=0.0;
|
||||
iodata.N=iodata.M=iodata.totalt=0;
|
||||
int Mo=0;
|
||||
/* use iodata to store the results, also check for consistency of results */
|
||||
for (int cm=0; cm<iodata.Nms; cm++) {
|
||||
MPI_Recv(bufint, 4, /* N,M,tilesz,totalt */
|
||||
MPI_Recv(bufint, 5, /* N,Mo(actual clusters),M(with hybrid),tilesz,totalt */
|
||||
MPI_INT, cm+1, TAG_MSAUX, MPI_COMM_WORLD, &status);
|
||||
cout<<"Slave "<<cm+1<<" N="<<bufint[0]<<" M="<<bufint[1]<<" tilesz="<<bufint[2]<<" totaltime="<<bufint[3]<<endl;
|
||||
cout<<"Slave "<<cm+1<<" N="<<bufint[0]<<" M="<<bufint[1]<<"/"<<bufint[2]<<" tilesz="<<bufint[3]<<" totaltime="<<bufint[4]<<endl;
|
||||
if (cm==0) { /* update data */
|
||||
iodata.N=bufint[0];
|
||||
iodata.M=bufint[1];
|
||||
iodata.tilesz=bufint[2];
|
||||
iodata.totalt=bufint[3];
|
||||
Mo=bufint[1];
|
||||
iodata.M=bufint[2];
|
||||
iodata.tilesz=bufint[3];
|
||||
iodata.totalt=bufint[4];
|
||||
} else { /* compare against others */
|
||||
if ((iodata.N != bufint[0]) || (iodata.M != bufint[1]) || (iodata.tilesz != bufint[2])) {
|
||||
cout<<"Slave "<<cm+1<<" parameters do not match N="<<bufint[0]<<" M="<<bufint[1]<<" tilesz="<<bufint[2]<<endl;
|
||||
if ((iodata.N != bufint[0]) || (iodata.M != bufint[2]) || (iodata.tilesz != bufint[3])) {
|
||||
cout<<"Slave "<<cm+1<<" parameters do not match N="<<bufint[0]<<" M="<<bufint[2]<<" tilesz="<<bufint[3]<<endl;
|
||||
}
|
||||
if (iodata.totalt<bufint[3]) {
|
||||
if (iodata.totalt<bufint[4]) {
|
||||
/* use max value as total time */
|
||||
iodata.totalt=bufint[3];
|
||||
iodata.totalt=bufint[4];
|
||||
}
|
||||
}
|
||||
MPI_Recv(bufdouble, 1, /* freq */
|
||||
MPI_DOUBLE, cm+1, TAG_MSAUX, MPI_COMM_WORLD, &status);
|
||||
iodata.freqs[cm]=bufdouble[0];
|
||||
iodata.freq0 +=bufdouble[0];
|
||||
cout<<"Slave "<<cm+1<<" freq="<<bufdouble[0]<<endl;
|
||||
cout<<"Slave "<<cm+1<<" frequency (MHz)="<<bufdouble[0]*1e-6<<endl;
|
||||
}
|
||||
iodata.freq0/=(double)iodata.Nms;
|
||||
delete [] bufint;
|
||||
delete [] bufdouble;
|
||||
cout<<"Reference freq="<<iodata.freq0<<endl;
|
||||
cout<<"Reference frequency (MHz)="<<iodata.freq0*1.0e-6<<endl;
|
||||
/* ADMM memory */
|
||||
double *Z,*Y,*z;
|
||||
/* Z: 2Nx2 x Npoly x M */
|
||||
|
@ -126,6 +128,24 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
|
|||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
/* file for saving solutions */
|
||||
FILE *sfp=0;
|
||||
if (solfile) {
|
||||
if ((sfp=fopen(solfile,"w+"))==0) {
|
||||
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
/* write additional info to solution file */
|
||||
if (solfile) {
|
||||
fprintf(sfp,"# solution file (Z) created by SAGECal\n");
|
||||
fprintf(sfp,"# reference_freq(MHz) polynomial_order stations clusters effective_clusters\n");
|
||||
fprintf(sfp,"%lf %d %d %d %d\n",iodata.freq0*1e-6,Npoly,iodata.N,Mo,iodata.M);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* interpolation polynomial */
|
||||
double *B,*Bi;
|
||||
|
@ -139,12 +159,38 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
|
|||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
/* regularization factor array */
|
||||
double *arho;
|
||||
if ((arho=(double*)calloc((size_t)Nadmm,sizeof(double)))==0) {
|
||||
/* regularization factor array, size Mx1
|
||||
one per each hybrid cluster */
|
||||
double *arho,*arhoslave;
|
||||
if ((arho=(double*)calloc((size_t)iodata.M,sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((arhoslave=(double*)calloc((size_t)Mo,sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* if text file is given, read it and update rho array */
|
||||
if (Data::admm_rho_file) {
|
||||
read_arho_fromfile(Data::admm_rho_file,iodata.M,arho,Mo,arhoslave);
|
||||
} else {
|
||||
/* copy common value */
|
||||
/* setup regularization factor array */
|
||||
for (int p=0; p<iodata.M; p++) {
|
||||
arho[p]=admm_rho;
|
||||
}
|
||||
for (int p=0; p<Mo; p++) {
|
||||
arhoslave[p]=admm_rho;
|
||||
}
|
||||
}
|
||||
|
||||
/* send array to slaves */
|
||||
/* update rho on each slave */
|
||||
for(int cm=0; cm<iodata.Nms; cm++) {
|
||||
MPI_Send(arhoslave, Mo, MPI_DOUBLE, cm+1,TAG_RHO, MPI_COMM_WORLD);
|
||||
}
|
||||
free(arhoslave);
|
||||
|
||||
#ifdef DEBUG
|
||||
FILE *dfp;
|
||||
|
@ -159,11 +205,6 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
|
|||
/* find sum B(:,i)B(:,i)^T, and its pseudoinverse */
|
||||
find_prod_inverse(B,Bi,Npoly,iodata.Nms);
|
||||
|
||||
/* setup regularization factor array */
|
||||
for (int p=0; p<Nadmm; p++) {
|
||||
arho[p]=admm_rho;
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(dfp,"B=[\n");
|
||||
for (int p=0; p<Npoly; p++) {
|
||||
|
@ -174,7 +215,7 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
|
|||
}
|
||||
fprintf(dfp,"];\n");
|
||||
fprintf(dfp,"rho=%lf;\narho=[",admm_rho);
|
||||
for (int p=0; p<Nadmm; p++) {
|
||||
for (int p=0; p<iodata.M; p++) {
|
||||
fprintf(dfp,"%lf ",arho[p]);
|
||||
}
|
||||
fprintf(dfp,"];\n");
|
||||
|
@ -197,8 +238,13 @@ cout<<"Reference freq="<<iodata.freq0<<endl;
|
|||
#ifdef DEBUG
|
||||
Ntime=1;
|
||||
#endif
|
||||
cout<<"Master total timeslots="<<Ntime<<endl;
|
||||
cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="<<admm_rho<<endl;
|
||||
cout<<"Master total timeslots="<<Ntime<<endl;
|
||||
|
||||
if (!Data::admm_rho_file) {
|
||||
cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="<<admm_rho<<endl;
|
||||
} else {
|
||||
cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization given by text file "<<Data::admm_rho_file<<endl;
|
||||
}
|
||||
int msgcode;
|
||||
for (int ct=0; ct<Ntime; ct++) {
|
||||
/* send start processing signal to slaves */
|
||||
|
@ -208,11 +254,6 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
|
|||
}
|
||||
|
||||
for (int admm=0; admm<Nadmm; admm++) {
|
||||
/* update rho on each slave */
|
||||
for(int cm=0; cm<iodata.Nms; cm++) {
|
||||
MPI_Send(&arho[admm], 1, MPI_DOUBLE, cm+1,TAG_RHO, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
/* get Y_i+rho J_i from each slave */
|
||||
/* note: for first iteration, reorder values as
|
||||
2Nx2 complex matrix blocks, M times from each slave
|
||||
|
@ -270,8 +311,16 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
|
|||
my_daxpy(8*iodata.N*iodata.M, &Y[cm*8*iodata.N*iodata.M], B[cm*Npoly+ci], &z[ci*8*iodata.N*iodata.M]);
|
||||
}
|
||||
}
|
||||
/* also scale by 1/rho */
|
||||
my_dscal(8*iodata.N*iodata.M*Npoly,1.0/arho[admm],z);
|
||||
/* also scale by 1/rho, only if rho>0, otherwise set it to 0.0*/
|
||||
for (int cm=0; cm<iodata.M; cm++) {
|
||||
double invscale=0.0;
|
||||
if (arho[cm]>0.0) {
|
||||
invscale=1.0/arho[cm];
|
||||
}
|
||||
for (int ci=0; ci<Npoly; ci++) {
|
||||
my_dscal(8*iodata.N,invscale,&z[8*iodata.N*iodata.M*ci+8*iodata.N*cm]);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(dfp,"%%%%%%%%%%%%%% time=%d admm=%d\n",ct,admm);
|
||||
|
@ -342,6 +391,19 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
|
|||
resetcount++;
|
||||
}
|
||||
}
|
||||
|
||||
/* write Z to solution file, same format as J, but we have Npoly times more
|
||||
values per timeslot per column */
|
||||
if (solfile) {
|
||||
for (int p=0; p<iodata.N*8*Npoly; p++) {
|
||||
fprintf(sfp,"%d ",p);
|
||||
for (int pp=0; pp<iodata.M; pp++) {
|
||||
fprintf(sfp," %e",Z[pp*iodata.N*8*Npoly+p]);
|
||||
}
|
||||
fprintf(sfp,"\n");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* send end signal to each slave */
|
||||
|
@ -350,6 +412,10 @@ cout<<"ADMM iterations="<<Nadmm<<" polynomial order="<<Npoly<<" regularization="
|
|||
MPI_Send(&msgcode, 1, MPI_INT, cm+1,TAG_CTRL, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
if (solfile) {
|
||||
fclose(sfp);
|
||||
}
|
||||
|
||||
|
||||
#ifdef DEBUG
|
||||
fclose(dfp);
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <pthread.h>
|
||||
#include <time.h>
|
||||
|
||||
#include<sagecal.h>
|
||||
#include <mpi.h>
|
||||
|
@ -71,21 +72,14 @@ sagecal_slave(int argc, char **argv) {
|
|||
double **pm;
|
||||
complex double *coh;
|
||||
FILE *sfp=0;
|
||||
if (solfile) {
|
||||
if ((sfp=fopen(solfile,"w+"))==0) {
|
||||
/* always create default solution file name MS+'.solutions' */
|
||||
std::string filebuff=std::string(Data::TableName)+std::string(".solutions\0");
|
||||
if ((sfp=fopen(filebuff.c_str(),"w+"))==0) {
|
||||
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
/* create default solution file name MS+'.solutions' */
|
||||
std::string filebuff=std::string(Data::TableName)+std::string(".solutions\0");
|
||||
if ((sfp=fopen(filebuff.c_str(),"w+"))==0) {
|
||||
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
}
|
||||
/* set solfile to non null value */
|
||||
solfile=const_cast<char*>(filebuff.c_str());
|
||||
}
|
||||
/* set solfile to non null value */
|
||||
solfile=const_cast<char*>(filebuff.c_str());
|
||||
|
||||
|
||||
double mean_nu;
|
||||
|
@ -192,17 +186,25 @@ sagecal_slave(int argc, char **argv) {
|
|||
msitr[cm]->origin();
|
||||
}
|
||||
|
||||
/* write additional info to solution file */
|
||||
if (solfile) {
|
||||
fprintf(sfp,"# solution file created by SAGECal\n");
|
||||
fprintf(sfp,"# freq(MHz) bandwidth(MHz) time_interval(min) stations clusters effective_clusters\n");
|
||||
fprintf(sfp,"%lf %lf %lf %d %d %d\n",iodata.freq0*1e-6,iodata.deltaf*1e-6,(double)iodata.tilesz*iodata.deltat/60.0,iodata.N,M,Mt);
|
||||
}
|
||||
|
||||
|
||||
/**** send info to master ***************************************/
|
||||
/* send freq (freq0), no. stations (N), total timeslots (totalt), no. of clusters (Mt), integration time (deltat), bandwidth (deltaf) */
|
||||
int *bufint=new int[4];
|
||||
/* send freq (freq0), no. stations (N), total timeslots (totalt), no. of clusters (M), true no. of clusters with hybrid (Mt), integration time (deltat), bandwidth (deltaf) */
|
||||
int *bufint=new int[5];
|
||||
double *bufdouble=new double[1];
|
||||
bufint[0]=iodata.N;
|
||||
bufint[1]=Mt;
|
||||
bufint[2]=iodata.tilesz;
|
||||
bufint[3]=iodata.totalt;
|
||||
bufint[1]=M;
|
||||
bufint[2]=Mt;
|
||||
bufint[3]=iodata.tilesz;
|
||||
bufint[4]=iodata.totalt;
|
||||
bufdouble[0]=iodata.freq0;
|
||||
MPI_Send(bufint, 4, MPI_INT, 0,TAG_MSAUX, MPI_COMM_WORLD);
|
||||
MPI_Send(bufint, 5, MPI_INT, 0,TAG_MSAUX, MPI_COMM_WORLD);
|
||||
MPI_Send(bufdouble, 1, MPI_DOUBLE, 0,TAG_MSAUX, MPI_COMM_WORLD);
|
||||
|
||||
delete [] bufint;
|
||||
|
@ -227,6 +229,15 @@ sagecal_slave(int argc, char **argv) {
|
|||
exit(1);
|
||||
}
|
||||
|
||||
double *arho;
|
||||
if ((arho=(double*)calloc((size_t)M,sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* get regularization factor array */
|
||||
MPI_Recv(arho,M,MPI_DOUBLE,0,TAG_RHO,MPI_COMM_WORLD,&status);
|
||||
|
||||
/* if we have more than 1 channel, need to backup raw data */
|
||||
double *xbackup=0;
|
||||
if (iodata.Nchan>1) {
|
||||
|
@ -236,6 +247,7 @@ sagecal_slave(int argc, char **argv) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
int msgcode=0;
|
||||
/* starting iterations doubled */
|
||||
int start_iter=1;
|
||||
|
@ -265,22 +277,19 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
|
|||
/******************** ADMM *******************************/
|
||||
|
||||
for (int admm=0; admm<Nadmm; admm++) {
|
||||
/* get current regularization factor */
|
||||
MPI_Recv(&admm_rho,1,MPI_DOUBLE,0,TAG_RHO,MPI_COMM_WORLD,&status);
|
||||
|
||||
/* ADMM 1: minimize cost function */
|
||||
if (admm==0) {
|
||||
#ifndef HAVE_CUDA
|
||||
if (start_iter) {
|
||||
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,0,&mean_nu,&res_0,&res_1); /* 0 for dummy whiten flag */
|
||||
start_iter=0;
|
||||
} else {
|
||||
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,0,&mean_nu,&res_0,&res_1);
|
||||
}
|
||||
#endif /* !HAVE_CUDA */
|
||||
#ifdef HAVE_CUDA
|
||||
if (start_iter) {
|
||||
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
start_iter=0;
|
||||
} else {
|
||||
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,Data::max_lbfgs,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
|
@ -301,11 +310,11 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
|
|||
}
|
||||
|
||||
#ifndef HAVE_CUDA
|
||||
sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
|
||||
#endif /* !HAVE_CUDA */
|
||||
#ifdef HAVE_CUDA
|
||||
//sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities_admm_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,admm_rho,&mean_nu,&res_0,&res_1);
|
||||
//sagefit_visibilities_admm(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities_admm_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Y,Z,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,0,Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,arho,&mean_nu,&res_0,&res_1);
|
||||
#endif /* HAVE_CUDA */
|
||||
}
|
||||
|
||||
|
@ -314,10 +323,25 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
|
|||
if (admm==0) {
|
||||
/* Y is set to 0 : so original is just rho * J*/
|
||||
my_dcopy(iodata.N*8*Mt, p, 1, Y, 1);
|
||||
my_dscal(iodata.N*8*Mt, admm_rho, Y);
|
||||
/* scale by individual rho for each cluster */
|
||||
/* if rho<=0, do nothing */
|
||||
ck=0;
|
||||
for (ci=0; ci<M; ci++) {
|
||||
/* Y will be set to 0 if rho<=0 */
|
||||
my_dscal(iodata.N*8*carr[ci].nchunk, arho[ci], &Y[ck]);
|
||||
ck+=iodata.N*8*carr[ci].nchunk;
|
||||
}
|
||||
} else {
|
||||
my_daxpy(iodata.N*8*Mt, p, admm_rho, Y);
|
||||
ck=0;
|
||||
for (ci=0; ci<M; ci++) {
|
||||
if (arho[ci]>0.0) {
|
||||
my_daxpy(iodata.N*8*carr[ci].nchunk, &p[ck], arho[ci], &Y[ck]);
|
||||
}
|
||||
ck+=iodata.N*8*carr[ci].nchunk;
|
||||
//cout<<"Clus="<<ci<<" Chunk="<<carr[ci].nchunk<<" Rho="<<arho[ci]<<endl;
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Send(Y, iodata.N*8*Mt, MPI_DOUBLE, 0,TAG_YDATA, MPI_COMM_WORLD);
|
||||
/* for initial ADMM iteration, get back Y with common unitary ambiguity */
|
||||
if (admm==0) {
|
||||
|
@ -329,7 +353,14 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
|
|||
|
||||
/* update Y_i <= Y_i + rho (J_i-B_i Z)
|
||||
since we already have Y_i + rho J_i, only need -rho (B_i Z) */
|
||||
my_daxpy(iodata.N*8*Mt, Z, -admm_rho, Y);
|
||||
ck=0;
|
||||
for (ci=0; ci<M; ci++) {
|
||||
if (arho[ci]>0.0) {
|
||||
my_daxpy(iodata.N*8*carr[ci].nchunk, &Z[ck], -arho[ci], &Y[ck]);
|
||||
}
|
||||
ck+=iodata.N*8*carr[ci].nchunk;
|
||||
}
|
||||
|
||||
/* calculate primal residual J-BZ */
|
||||
my_dcopy(iodata.N*8*Mt, p, 1, pres, 1);
|
||||
my_daxpy(iodata.N*8*Mt, Z, -1.0, pres);
|
||||
|
@ -365,21 +396,21 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
|
|||
/* if residual has increased too much, or all are flagged (0 residual)
|
||||
or NaN
|
||||
reset solutions to original
|
||||
initial values */
|
||||
if (res_1==0.0 || !isfinite(res_1) || res_1>res_ratio*res_prev) {
|
||||
initial values : use residual at 1st ADMM */
|
||||
if (res_01==0.0 || !isfinite(res_01) || res_01>res_ratio*res_prev) {
|
||||
cout<<"Resetting Solution"<<endl;
|
||||
/* reset solutions so next iteration has default initial values */
|
||||
memcpy(p,pinit,(size_t)iodata.N*8*Mt*sizeof(double));
|
||||
/* also assume iterations have restarted from scratch */
|
||||
start_iter=1;
|
||||
/* also forget min residual (otherwise will try to reset it always) */
|
||||
res_prev=res_1;
|
||||
} else if (res_1<res_prev) { /* only store the min value */
|
||||
res_prev=res_1;
|
||||
res_prev=res_01;
|
||||
} else if (res_01<res_prev) { /* only store the min value */
|
||||
res_prev=res_01;
|
||||
}
|
||||
end_time = time(0);
|
||||
elapsed_time = ((double) (end_time-start_time)) / 60.0;
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
cout<<"nu="<<mean_nu<<endl;
|
||||
}
|
||||
cout<<myrank<< ": Timeslot: "<<tilex<<" residual: initial="<<res_00<<"/"<<res_0<<",final="<<res_01<<"/"<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
|
||||
|
@ -463,6 +494,7 @@ cout<<"Slave "<<myrank<<" quitting"<<endl;
|
|||
free(Z);
|
||||
free(Y);
|
||||
free(pres);
|
||||
free(arho);
|
||||
/**********************************************************/
|
||||
|
||||
cout<<"Done."<<endl;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
OUTPUT=
|
||||
CXX=g++
|
||||
CXXFLAGS=-O3 -Wall -g #-fnostack-protector
|
||||
CXXFLAGS=-O3 -Wall -g #-pg #-fnostack-protector
|
||||
CASA_LIBDIR=/opt/casacore/lib
|
||||
CASA_INCDIR=/opt/casacore/include/casacore
|
||||
CASA_LIBS=-lcasa_casa -lcasa_tables -lcasa_measures -lcasa_ms -lcfitsio
|
||||
|
|
|
@ -55,6 +55,7 @@ int Data::lbfgs_m=7;
|
|||
int Data::gpu_threads=128;
|
||||
int Data::linsolv=1;
|
||||
int Data::randomize=1;
|
||||
int Data::whiten=0;
|
||||
int Data::DoSim=0;
|
||||
int Data::DoDiag=0;
|
||||
int Data::doChan=0; /* if 1, solve for each channel in multi channel data */
|
||||
|
@ -69,6 +70,7 @@ char *Data::MSlist=NULL;
|
|||
int Data::Nadmm=1;
|
||||
int Data::Npoly=2;
|
||||
double Data::admm_rho=5.0;
|
||||
char *Data::admm_rho_file=NULL;
|
||||
|
||||
/* no upper limit, solve for all timeslots */
|
||||
int Data::Nmaxtime=0;
|
||||
|
@ -298,6 +300,7 @@ Data::loadData(Table ti, Data::IOData iodata) {
|
|||
for(int k = 0; k < iodata.Nchan; k++) {
|
||||
Complex *ptr = data[k].data();
|
||||
bool *flgptr=flag[k].data();
|
||||
//if (!flag.data()[k]){
|
||||
if (!flgptr[0] && !flgptr[1] && !flgptr[2] && !flgptr[3]){
|
||||
cxx+=ptr[0];
|
||||
cxy+=ptr[1];
|
||||
|
@ -438,6 +441,7 @@ Data::loadDataList(vector<MSIter*> msitr, Data::IOData iodata) {
|
|||
for(int k = 0; k < iodata.NchanMS[cm]; k++) {
|
||||
Complex *ptr = data[k].data();
|
||||
bool *flgptr=flag[k].data();
|
||||
//if (!flag.data()[k]){
|
||||
if (!flgptr[0] && !flgptr[1] && !flgptr[2] && !flgptr[3]){
|
||||
cxx+=ptr[0];
|
||||
cxy+=ptr[1];
|
||||
|
|
|
@ -114,6 +114,7 @@ namespace Data
|
|||
extern char *ignorefile;
|
||||
extern double nulow,nuhigh;
|
||||
extern int randomize;
|
||||
extern int whiten;
|
||||
extern int DoSim; /* if 1, simulation mode */
|
||||
extern int doChan; /* if 1, solve for each channel in multi channel data */
|
||||
extern int DoDiag; /* if >0, enables diagnostics (Leverage) 1: write leverage as output (no residual), 2: only calculate fractions of leverage/noise */
|
||||
|
@ -122,6 +123,7 @@ namespace Data
|
|||
extern int Nadmm; /* ADMM iterations >=1 */
|
||||
extern int Npoly; /* polynomial order >=1 */
|
||||
extern double admm_rho; /* regularization */
|
||||
extern char *admm_rho_file; /* text file for regularization of each cluster */
|
||||
/* for debugging, upper limit on time slots */
|
||||
extern int Nmaxtime;
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ using namespace Data;
|
|||
|
||||
void
|
||||
print_copyright(void) {
|
||||
cout<<"SAGECal 0.3.5 (C) 2011-2015 Sarod Yatawatta"<<endl;
|
||||
cout<<"SAGECal 0.3.8 (C) 2011-2015 Sarod Yatawatta"<<endl;
|
||||
}
|
||||
|
||||
|
||||
|
@ -47,15 +47,15 @@ print_help(void) {
|
|||
cout << "-c cluster.txt: cluster file"<< endl;
|
||||
cout << "-p solutions.txt: if given, save solution in this file"<< endl;
|
||||
cout << "-F sky model format: 0: LSM, 1: LSM with 3 order spectra : default "<< Data::format<<endl;
|
||||
cout << "-I input column (DATA/CORRECTED_DATA) : default " <<Data::DataField<< endl;
|
||||
cout << "-O ouput column (DATA/CORRECTED_DATA) : default " <<Data::OutField<< endl;
|
||||
cout << "-I input column (DATA/CORRECTED_DATA/...) : default " <<Data::DataField<< endl;
|
||||
cout << "-O ouput column (DATA/CORRECTED_DATA/...) : default " <<Data::OutField<< endl;
|
||||
cout << "-e max EM iterations : default " <<Data::max_emiter<< endl;
|
||||
cout << "-g max iterations (within single EM) : default " <<Data::max_iter<< endl;
|
||||
cout << "-l max LBFGS iterations : default " <<Data::max_lbfgs<< endl;
|
||||
cout << "-m LBFGS memory size : default " <<Data::lbfgs_m<< endl;
|
||||
cout << "-n no of worker threads : default "<<Data::Nt << endl;
|
||||
cout << "-t tile size : default " <<Data::TileSize<< endl;
|
||||
cout << "-a 0,1 : if 1, only simulate (with solutions if solutions file is also given): default " <<Data::DoSim<< endl;
|
||||
cout << "-a 0,1,2 : if 1, only simulate, if 2, simulate and add to residual, (multiplied by solutions if solutions file is also given): default " <<Data::DoSim<< endl;
|
||||
cout << "-z ignore_clusters: if only doing a simulation, ignore the cluster ids listed in this file" << endl;
|
||||
cout << "-b 0,1 : if 1, solve for each channel: default " <<Data::doChan<< endl;
|
||||
cout << "-x exclude baselines length (lambda) lower than this in calibration : default "<<Data::min_uvcut << endl;
|
||||
|
@ -63,9 +63,10 @@ print_help(void) {
|
|||
cout <<endl<<"Advanced options:"<<endl;
|
||||
cout << "-k cluster_id : correct residuals with solution of this cluster : default "<<Data::ccid<< endl;
|
||||
cout << "-o robust rho, robust matrix inversion during correction: default "<<Data::rho<< endl;
|
||||
cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR: default "<<Data::solver_mode<< endl;
|
||||
cout << "-j 0,1,2... 0 : OSaccel, 1 no OSaccel, 2: OSRLM, 3: RLM, 4: RTR, 5: RRTR, 6:NSD : default "<<Data::solver_mode<< endl;
|
||||
cout << "-L robust nu, lower bound: default "<<Data::nulow<< endl;
|
||||
cout << "-H robust nu, upper bound: default "<<Data::nuhigh<< endl;
|
||||
cout << "-W pre-whiten data: default "<<Data::whiten<< endl;
|
||||
cout << "-R randomize iterations: default "<<Data::randomize<< endl;
|
||||
cout << "-D 0,1,2 : if >0, enable diagnostics (Jacobian Leverage) 1 replace Jacobian Leverage as output, 2 only fractional noise/leverage is printed: default " <<Data::DoDiag<< endl;
|
||||
cout <<"Report bugs to <sarod@users.sf.net>"<<endl;
|
||||
|
@ -81,7 +82,7 @@ ParseCmdLine(int ac, char **av) {
|
|||
print_help();
|
||||
exit(0);
|
||||
}
|
||||
while((c=getopt(ac, av, "a:b:c:d:e:f:g:j:k:l:m:n:o:p:s:t:x:y:z:D:F:I:O:L:H:R:h"))!= -1)
|
||||
while((c=getopt(ac, av, "a:b:c:d:e:f:g:j:k:l:m:n:o:p:s:t:x:y:z:D:F:I:O:L:H:R:W:h"))!= -1)
|
||||
{
|
||||
switch(c)
|
||||
{
|
||||
|
@ -105,7 +106,7 @@ ParseCmdLine(int ac, char **av) {
|
|||
break;
|
||||
case 'a':
|
||||
DoSim= atoi(optarg);
|
||||
if (DoSim>1) { DoSim=1; }
|
||||
if (DoSim<0) { DoSim=1; }
|
||||
break;
|
||||
case 'b':
|
||||
doChan= atoi(optarg);
|
||||
|
@ -154,6 +155,9 @@ ParseCmdLine(int ac, char **av) {
|
|||
case 'R':
|
||||
randomize= atoi(optarg);
|
||||
break;
|
||||
case 'W':
|
||||
whiten= atoi(optarg);
|
||||
break;
|
||||
case 'x':
|
||||
Data::min_uvcut= atof(optarg);
|
||||
break;
|
||||
|
@ -187,13 +191,13 @@ ParseCmdLine(int ac, char **av) {
|
|||
cout<<"Selecting baselines > "<<min_uvcut<<" and < "<<max_uvcut<<" wavelengths."<<endl;
|
||||
if (!DoSim) {
|
||||
cout<<"Using ";
|
||||
if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS) {
|
||||
if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
cout<<"Gaussian noise model for solver."<<endl;
|
||||
} else {
|
||||
cout<<"Robust noise model for solver with degrees of freedom ["<<nulow<<","<<nuhigh<<"]."<<endl;
|
||||
}
|
||||
} else {
|
||||
cout<<"Only doing simulation."<<endl;
|
||||
cout<<"Only doing simulation (with possible correction for cluster id "<<ccid<<")."<<endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -226,11 +230,11 @@ main(int argc, char **argv) {
|
|||
}
|
||||
/**********************************************************/
|
||||
int M,Mt,ci,cj,ck;
|
||||
/* parameters */
|
||||
double *p,*pinit,*pfreq;
|
||||
double **pm;
|
||||
complex double *coh;
|
||||
FILE *sfp=0;
|
||||
/* parameters */
|
||||
double *p,*pinit,*pfreq;
|
||||
double **pm;
|
||||
complex double *coh;
|
||||
FILE *sfp=0;
|
||||
if (solfile) {
|
||||
if (!Data::DoSim) {
|
||||
if ((sfp=fopen(solfile,"w+"))==0) {
|
||||
|
@ -240,9 +244,16 @@ main(int argc, char **argv) {
|
|||
} else {
|
||||
/* simulation mode, read only access */
|
||||
if ((sfp=fopen(solfile,"r"))==0) {
|
||||
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
|
||||
fprintf(stderr,"%s: %d: no solution file present\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
}
|
||||
/* remember to skip first 3 lines from solution file */
|
||||
char chr;
|
||||
for (ci=0; ci<3; ci++) {
|
||||
do {
|
||||
chr = fgetc(sfp);
|
||||
} while (chr != '\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -390,7 +401,15 @@ main(int argc, char **argv) {
|
|||
for(int cm=0; cm<iodata.Nms; cm++) {
|
||||
msitr[cm]->origin();
|
||||
}
|
||||
/* starting iterations doubled */
|
||||
|
||||
/* write additional info to solution file */
|
||||
if (solfile && !Data::DoSim) {
|
||||
fprintf(sfp,"# solution file created by SAGECal\n");
|
||||
fprintf(sfp,"# freq(MHz) bandwidth(MHz) time_interval(min) stations clusters effective_clusters\n");
|
||||
fprintf(sfp,"%lf %lf %lf %d %d %d\n",iodata.freq0*1e-6,iodata.deltaf*1e-6,(double)iodata.tilesz*iodata.deltat/60.0,iodata.N,M,Mt);
|
||||
}
|
||||
|
||||
/* starting iterations are doubled */
|
||||
int start_iter=1;
|
||||
while (msitr[0]->more()) {
|
||||
start_time = time(0);
|
||||
|
@ -456,7 +475,8 @@ main(int argc, char **argv) {
|
|||
inout(p: length(8*mic_N*Mt))
|
||||
sagefit_visibilities_mic(mic_u,mic_v,mic_w,mic_x,mic_N,mic_Nbase,mic_tilesz,barr,mic_chunks,mic_pindex,coh,M,Mt,mic_freq0,mic_deltaf,p,mic_data_min_uvcut,mic_data_Nt,2*mic_data_max_emiter,mic_data_max_iter,(mic_data_dochan? 0 :mic_data_max_lbfgs),mic_data_lbfgs_m,mic_data_gpu_threads,mic_data_linsolv,mic_data_solver_mode,mic_data_nulow,mic_data_nuhigh,mic_data_randomize,&mean_nu,&res_0,&res_1);
|
||||
#else /* NOT MIC */
|
||||
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
|
||||
//sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0 :Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
|
||||
#endif /* USE_MIC */
|
||||
start_iter=0;
|
||||
} else {
|
||||
|
@ -474,14 +494,14 @@ main(int argc, char **argv) {
|
|||
inout(p: length(8*mic_N*Mt))
|
||||
sagefit_visibilities_mic(mic_u,mic_v,mic_w,mic_x,mic_N,mic_Nbase,mic_tilesz,barr,mic_chunks,mic_pindex,coh,M,Mt,mic_freq0,mic_deltaf,p,mic_data_min_uvcut,mic_data_Nt,mic_data_max_emiter,mic_data_max_iter,(mic_data_dochan? 0: mic_data_max_lbfgs),mic_data_lbfgs_m,mic_data_gpu_threads,mic_data_linsolv,mic_data_solver_mode,mic_data_nulow,mic_data_nuhigh,mic_data_randomize,&mean_nu,&res_0,&res_1);
|
||||
#else /* NOT MIC */
|
||||
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,Data::whiten,&mean_nu,&res_0,&res_1);
|
||||
#endif /* USE_MIC */
|
||||
}
|
||||
#endif /* !HAVE_CUDA */
|
||||
#ifdef HAVE_CUDA
|
||||
#ifdef ONE_GPU
|
||||
if (start_iter) {
|
||||
sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt, (iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0: Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
start_iter=0;
|
||||
} else {
|
||||
sagefit_visibilities_dual_pt_one_gpu(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
|
@ -489,7 +509,7 @@ main(int argc, char **argv) {
|
|||
#endif /* ONE_GPU */
|
||||
#ifndef ONE_GPU
|
||||
if (start_iter) {
|
||||
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(Data::solver_mode==SM_RTR_OSRLM_RLBFGS?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,(iodata.N<=64?2*Data::max_emiter:4*Data::max_emiter),Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,(iodata.N<=64 && Data::solver_mode==SM_RTR_OSLM_LBFGS?SM_OSLM_LBFGS:(iodata.N<=64 && (Data::solver_mode==SM_RTR_OSRLM_RLBFGS||Data::solver_mode==SM_NSD_RLBFGS)?SM_OSLM_OSRLM_RLBFGS:Data::solver_mode)),Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
///DBG sagefit_visibilities_dual_pt_flt(iodata.u,iodata.v,iodata.w,iodata.x,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,iodata.freq0,iodata.deltaf,p,Data::min_uvcut,Data::Nt,2*Data::max_emiter,Data::max_iter,(Data::doChan? 0:Data::max_lbfgs),Data::lbfgs_m,Data::gpu_threads,Data::linsolv,Data::solver_mode,Data::nulow,Data::nuhigh,Data::randomize,&mean_nu,&res_0,&res_1);
|
||||
start_iter=0;
|
||||
} else {
|
||||
|
@ -561,11 +581,11 @@ main(int argc, char **argv) {
|
|||
} else {
|
||||
/************ simulation only mode ***************************/
|
||||
if (!solfile) {
|
||||
predict_visibilities_multifreq(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt);
|
||||
predict_visibilities_multifreq(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,(Data::DoSim>1?1:0));
|
||||
} else {
|
||||
read_solutions(sfp,p,carr,iodata.N,M);
|
||||
/* if solution file is given, read in the solutions and predict */
|
||||
predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt);
|
||||
predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,(Data::DoSim>1?1:0),Data::ccid,Data::rho);
|
||||
}
|
||||
/************ end simulation only mode ***************************/
|
||||
}
|
||||
|
@ -613,10 +633,14 @@ main(int argc, char **argv) {
|
|||
}
|
||||
end_time = time(0);
|
||||
elapsed_time = ((double) (end_time-start_time)) / 60.0;
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
if (!Data::DoSim) {
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS||solver_mode==SM_RLM_RLBFGS||solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
cout<<"nu="<<mean_nu<<endl;
|
||||
}
|
||||
cout<<"Timeslot: "<<tilex<<" Residual: initial="<<res_0<<",final="<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
|
||||
cout<<"Timeslot: "<<tilex<<" Residual: initial="<<res_0<<",final="<<res_1<<", Time spent="<<elapsed_time<<" minutes"<<endl;
|
||||
} else {
|
||||
cout<<"Timeslot: "<<tilex<<", Time spent="<<elapsed_time<<" minutes"<<endl;
|
||||
}
|
||||
}
|
||||
|
||||
Data::freeData(iodata);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
CC=gcc
|
||||
CXX=g++
|
||||
CFLAGS= -Wall -O3 -g
|
||||
CFLAGS= -Wall -O3 -g #-pg
|
||||
CLIBS= -lm -lpthread
|
||||
#LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
|
||||
#LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
|
||||
|
@ -52,7 +52,6 @@ rtr_solve_robust_admm.o:rtr_solve_robust_admm.c
|
|||
admm_solve.o:admm_solve.c
|
||||
$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
|
||||
|
||||
|
||||
RANLIB=ranlib
|
||||
libsagecal.a:$(OBJECTS) sagecal.h
|
||||
ar rv $@ $(OBJECTS); \
|
||||
|
|
|
@ -387,7 +387,7 @@ minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {
|
|||
|
||||
int
|
||||
sagefit_visibilities_admm(double *u, double *v, double *w, double *x, int N,
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1) {
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1) {
|
||||
|
||||
int ci,cj,ck,tcj;
|
||||
double *p; // parameters: m x 1
|
||||
|
@ -511,7 +511,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
|
|||
/* use a reasonable TR radius because cost function has extra
|
||||
regularization NB: ADMM very sensitive to this */
|
||||
double Delta0=2.0;
|
||||
rtr_solve_nocuda_robust_admm(&p[carr[cj].p[ck]], &Y[carr[cj].p[ck]], &BZ[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+5, this_itermax+10, Delta0, Delta0*0.125, admm_rho, nulow, nuhigh, info, &lmdata);
|
||||
rtr_solve_nocuda_robust_admm(&p[carr[cj].p[ck]], &Y[carr[cj].p[ck]], &BZ[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+5, this_itermax+10, Delta0, Delta0*0.125, admm_rho[cj], nulow, nuhigh, info, &lmdata);
|
||||
if (ci==max_emiter-1){
|
||||
robust_nuM[cj]+=lmdata.robust_nu;
|
||||
}
|
||||
|
@ -563,6 +563,8 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
|
|||
free(robust_nuM);
|
||||
if (robust_nu0<nulow) {
|
||||
robust_nu0=nulow;
|
||||
} else if (robust_nu0>nuhigh) {
|
||||
robust_nu0=nuhigh;
|
||||
}
|
||||
|
||||
/* final residual calculation */
|
||||
|
@ -601,7 +603,7 @@ pipeline_slave_code_admm_flt(void *data)
|
|||
sync_barrier(&(td->pline->gate2)); /* stop at gate 2 */
|
||||
/* do work : only one solver */
|
||||
//printf("state=%d, thread %d\n",gd->status[tid],tid);
|
||||
if (gd->status[tid]==PT_DO_WORK_RRTR ) {
|
||||
if (gd->status[tid]==PT_DO_WORK_RRTR || gd->status[tid]==PT_DO_WORK_NSD) {
|
||||
/************************* work *********************/
|
||||
me_data_t *t=(me_data_t *)gd->lmdata[tid];
|
||||
/* divide the tiles into chunks tilesz/nchunk */
|
||||
|
@ -626,10 +628,13 @@ pipeline_slave_code_admm_flt(void *data)
|
|||
ntiles=t->tilesz-cj;
|
||||
}
|
||||
|
||||
/* max trust region radius: keep reasonable */
|
||||
float Delta0=2.0f;
|
||||
/* storage (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M */
|
||||
ret=rtr_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->admm_rho, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
|
||||
if (gd->status[tid]==PT_DO_WORK_NSD) {
|
||||
ret=nsd_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+15, gd->admm_rho[tid], gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
|
||||
} else {
|
||||
/* max trust region radius: keep reasonable */
|
||||
float Delta0=2.0f;
|
||||
ret=rtr_solve_cuda_robust_admm_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->Y[tid][ci*(gd->M[tid])], &gd->Z[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->admm_rho[tid], gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
|
||||
}
|
||||
|
||||
init_res+=gd->info[tid][0];
|
||||
final_res+=gd->info[tid][1];
|
||||
|
@ -712,7 +717,7 @@ destroy_pipeline_admm_flt(th_pipeline *pline)
|
|||
//#define DEBUG
|
||||
int
|
||||
sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x, int N,
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize,double admm_rho, double *mean_nu, double *res_0, double *res_1) {
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize,double *admm_rho, double *mean_nu, double *res_0, double *res_1) {
|
||||
|
||||
|
||||
int ci,cj;
|
||||
|
@ -857,23 +862,20 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
|
|||
tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
|
||||
/* also calculate the total storage needed to be allocated on a GPU */
|
||||
/* determine total size for memory allocation */
|
||||
int Mm=8*N;
|
||||
int64_t data_sz=0;
|
||||
/* size for RTR (float) : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
|
||||
where M: no of stations (params/8) ntiles: tile size Nbase: baselines
|
||||
/* size for RTR/NSD (float), 128 is the ThreadsPerBlock
|
||||
NSD is a bit lower
|
||||
*/
|
||||
int64_t data_sz_rtr=0;
|
||||
/* 2 x Nbase more than normal RTR for weight, log(weight) */
|
||||
data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
|
||||
/* size for ROBUSTLM */
|
||||
data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
|
||||
data_sz=MAX(data_sz,data_sz_rtr);
|
||||
data_sz+=(int64_t)Mm*sizeof(float);
|
||||
if (solver_mode==SM_NSD_RLBFGS) {
|
||||
data_sz=(8*N*(7+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
|
||||
} else { /* default is RTR */
|
||||
data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
|
||||
}
|
||||
|
||||
tpg.data_size=data_sz;
|
||||
tpg.nulow=nulow;
|
||||
tpg.nuhigh=nuhigh;
|
||||
tpg.randomize=randomize;
|
||||
tpg.admm_rho=(float)admm_rho;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2*/
|
||||
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
|
||||
|
@ -948,6 +950,7 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
|
|||
tpg.p[0]=&pf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
|
||||
tpg.Y[0]=&Yf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
|
||||
tpg.Z[0]=&Zf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
|
||||
tpg.admm_rho[0]=(float)admm_rho[c0];
|
||||
tpg.x[0]=xdummy0f;
|
||||
tpg.M[0]=8*N; /* even though size of p is > M, dont change this */
|
||||
tpg.N[0]=n; /* Nbase*tilesz*8 */
|
||||
|
@ -960,6 +963,7 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
|
|||
tpg.p[1]=&pf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
|
||||
tpg.Y[1]=&Yf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
|
||||
tpg.Z[1]=&Zf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
|
||||
tpg.admm_rho[1]=(float)admm_rho[c1];
|
||||
tpg.x[1]=xdummy1f;
|
||||
tpg.M[1]=8*N; /* even though size of p is > M, dont change this */
|
||||
tpg.N[1]=n; /* Nbase*tilesz*8 */
|
||||
|
@ -971,7 +975,11 @@ sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x
|
|||
/**************************************************************************/
|
||||
|
||||
/* both threads do work */
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
|
||||
if (solver_mode==SM_NSD_RLBFGS) {
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_WORK_NSD;
|
||||
} else {
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
|
||||
}
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2 */
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1 */
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
|
||||
|
@ -1037,6 +1045,7 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
|
|||
tpg.p[0]=&pf[carr[c0].p[0]];
|
||||
tpg.Y[0]=&Yf[carr[c0].p[0]];
|
||||
tpg.Z[0]=&Zf[carr[c0].p[0]];
|
||||
tpg.admm_rho[0]=(float)admm_rho[c0];
|
||||
tpg.x[0]=xdummy0f;
|
||||
tpg.M[0]=8*N;
|
||||
tpg.N[0]=n;
|
||||
|
@ -1046,7 +1055,11 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
|
|||
tpg.linsolv=linsolv;
|
||||
tpg.lmdata[0]=&lmdata0;
|
||||
|
||||
tpg.status[0]=PT_DO_WORK_RRTR;
|
||||
if (solver_mode==SM_NSD_RLBFGS) {
|
||||
tpg.status[0]=PT_DO_WORK_NSD;
|
||||
} else {
|
||||
tpg.status[0]=PT_DO_WORK_RRTR;
|
||||
}
|
||||
|
||||
tpg.status[1]=PT_DO_NOTHING;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2 */
|
||||
|
@ -1106,9 +1119,12 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
|
|||
printf("mean nu=%lf\n",robust_nu0);
|
||||
#endif
|
||||
free(robust_nuM);
|
||||
if (robust_nu0<nulow) {
|
||||
if (robust_nu0<nulow) {
|
||||
robust_nu0=nulow;
|
||||
}
|
||||
} else if (robust_nu0>nuhigh) {
|
||||
robust_nu0=nuhigh;
|
||||
}
|
||||
|
||||
/******** free threads ***************/
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
|
||||
|
@ -1139,7 +1155,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
|
|||
|
||||
int
|
||||
sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, double *x, int N,
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize,double admm_rho, double *mean_nu, double *res_0, double *res_1) {
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize,double *admm_rho, double *mean_nu, double *res_0, double *res_1) {
|
||||
|
||||
|
||||
int ci,cj;
|
||||
|
@ -1284,24 +1300,16 @@ sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, doubl
|
|||
tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
|
||||
/* also calculate the total storage needed to be allocated on a GPU */
|
||||
/* determine total size for memory allocation */
|
||||
int Mm=8*N;
|
||||
int64_t data_sz=0;
|
||||
/* size for RTR (float) : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
|
||||
where M: no of stations (params/8) ntiles: tile size Nbase: baselines
|
||||
/* size for RTR/NSD (float), 128 is the ThreadsPerBlock
|
||||
NSD is a bit lower, but use the same
|
||||
*/
|
||||
int64_t data_sz_rtr=0;
|
||||
/* 2 x Nbase more than normal RTR for weight, log(weight) */
|
||||
data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
|
||||
/* size for ROBUSTLM */
|
||||
data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
|
||||
data_sz=MAX(data_sz,data_sz_rtr);
|
||||
data_sz+=(int64_t)Mm*sizeof(float);
|
||||
data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
|
||||
|
||||
tpg.data_size=data_sz;
|
||||
tpg.nulow=nulow;
|
||||
tpg.nuhigh=nuhigh;
|
||||
tpg.randomize=randomize;
|
||||
tpg.admm_rho=(float)admm_rho;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2*/
|
||||
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
|
||||
|
@ -1358,6 +1366,7 @@ sagefit_visibilities_admm_dual_pt_flt_one(double *u, double *v, double *w, doubl
|
|||
tpg.p[0]=&pf[carr[c0].p[0]];
|
||||
tpg.Y[0]=&Yf[carr[c0].p[0]];
|
||||
tpg.Z[0]=&Zf[carr[c0].p[0]];
|
||||
tpg.admm_rho[0]=(float)admm_rho[c0];
|
||||
tpg.x[0]=xdummy0f;
|
||||
tpg.M[0]=8*N;
|
||||
tpg.N[0]=n;
|
||||
|
@ -1427,9 +1436,13 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
|
|||
printf("mean nu=%lf\n",robust_nu0);
|
||||
#endif
|
||||
free(robust_nuM);
|
||||
if (robust_nu0<nulow) {
|
||||
if (robust_nu0<nulow) {
|
||||
robust_nu0=nulow;
|
||||
}
|
||||
} else if (robust_nu0>nuhigh) {
|
||||
robust_nu0=nuhigh;
|
||||
}
|
||||
|
||||
|
||||
/******** free threads ***************/
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "sagecal.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
@ -1175,8 +1176,14 @@ attach_gpu_to_thread2(int card, cublasHandle_t *cbhandle,float **WORK, int64_t
|
|||
cudaError_t err;
|
||||
culaStatus status;
|
||||
cublasStatus_t cbstatus;
|
||||
status=culaSelectDevice(card); /* FIXME: Do not enable CULA if its not going to be used */
|
||||
if (usecula) {
|
||||
status=culaSelectDevice(card); /* Do not enable CULA if its not going to be used */
|
||||
/* if first try failed, wait and retry */
|
||||
if (status) {
|
||||
fprintf(stderr,"%s: %d: CULA device select failure, retrying\n",__FILE__,__LINE__);
|
||||
sleep(10);
|
||||
status=culaSelectDevice(card);
|
||||
}
|
||||
checkStatus(status,__FILE__,__LINE__);
|
||||
status=culaInitialize();
|
||||
checkStatus(status,__FILE__,__LINE__);
|
||||
|
@ -1189,13 +1196,21 @@ attach_gpu_to_thread2(int card, cublasHandle_t *cbhandle,float **WORK, int64_t
|
|||
exit(1);
|
||||
}
|
||||
checkStatus(status,__FILE__,__LINE__);
|
||||
cudaSetDevice(card); /* we need this */
|
||||
} else { /* not using CULA */
|
||||
cudaSetDevice(card);
|
||||
}
|
||||
|
||||
cudaSetDevice(card); /* do we need this because sometimes we do not use cula */
|
||||
cbstatus=cublasCreate(cbhandle);
|
||||
if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
|
||||
fprintf(stderr,"%s: %d: CUBLAS create fail\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
/* retry once more before exiting */
|
||||
fprintf(stderr,"%s: %d: CUBLAS create failure, retrying\n",__FILE__,__LINE__);
|
||||
sleep(10);
|
||||
cbstatus=cublasCreate(cbhandle);
|
||||
if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
|
||||
fprintf(stderr,"%s: %d: CUBLAS create fail\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
err=cudaMalloc((void**)WORK, (size_t)work_size);
|
||||
|
@ -1246,7 +1261,6 @@ detach_gpu_from_thread2(int card,cublasHandle_t cbhandle,float *WORK, int usecul
|
|||
}
|
||||
cudaFree(WORK);
|
||||
|
||||
//cudaSetDevice(card);
|
||||
cudaDeviceReset();
|
||||
}
|
||||
void
|
||||
|
|
|
@ -4,6 +4,8 @@ export LD_LIBRARY_PATH
|
|||
source /opt/intel/composerxe/bin/compilervars.sh intel64
|
||||
|
||||
export MIC_ENV_PREFIX=MIC
|
||||
##export MKL_MIC_ENABLE=1
|
||||
##export OMP_NUM_THREADS=16
|
||||
export MIC_PREFIX=MIC
|
||||
export MIC_OMP_NUM_THREADS=240
|
||||
#
|
||||
|
|
|
@ -1975,6 +1975,8 @@ sagefit_visibilities_dual_pt(double *u, double *v, double *w, double *x, int N,
|
|||
free(robust_nuM);
|
||||
if (robust_nu0<nulow) {
|
||||
robust_nu0=nulow;
|
||||
} else if (robust_nu0>nuhigh) {
|
||||
robust_nu0=nuhigh;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2363,7 +2365,10 @@ sagefit_visibilities_dual_pt_one_gpu(double *u, double *v, double *w, double *x,
|
|||
free(robust_nuM);
|
||||
if (robust_nu0<nulow) {
|
||||
robust_nu0=nulow;
|
||||
} else if (robust_nu0>nuhigh) {
|
||||
robust_nu0=nuhigh;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/******** free threads ***************/
|
||||
|
@ -2503,7 +2508,7 @@ pipeline_slave_code_flt(void *data)
|
|||
//printf("state=%d, thread %d\n",gd->status[tid],tid);
|
||||
if (gd->status[tid]==PT_DO_WORK_LM || gd->status[tid]==PT_DO_WORK_OSLM
|
||||
|| gd->status[tid]==PT_DO_WORK_RLM || gd->status[tid]==PT_DO_WORK_OSRLM
|
||||
|| gd->status[tid]==PT_DO_WORK_RTR || gd->status[tid]==PT_DO_WORK_RRTR ) {
|
||||
|| gd->status[tid]==PT_DO_WORK_RTR || gd->status[tid]==PT_DO_WORK_RRTR || gd->status[tid]==PT_DO_WORK_NSD) {
|
||||
/************************* work *********************/
|
||||
me_data_t *t=(me_data_t *)gd->lmdata[tid];
|
||||
/* divide the tiles into chunks tilesz/nchunk */
|
||||
|
@ -2535,21 +2540,18 @@ pipeline_slave_code_flt(void *data)
|
|||
} else if (gd->status[tid]==PT_DO_WORK_RLM) {
|
||||
ret=rlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,(void*)gd->lmdata[tid]);
|
||||
} else if (gd->status[tid]==PT_DO_WORK_OSRLM) {
|
||||
ret=osrlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,gd->randomize,(void*)gd->lmdata[tid]);
|
||||
ret=osrlevmar_der_single_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->itermax[tid], gd->opts[tid], gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], gd->linsolv, cj, ntiles, gd->nulow,gd->nuhigh,gd->randomize,0,(void*)gd->lmdata[tid]); /* FIXME 0 for whiten */
|
||||
} else if (gd->status[tid]==PT_DO_WORK_RTR) {
|
||||
/* note stations: M/8, baselines ntiles*Nbase RSD+RTR */
|
||||
float Delta0=0.01f; /* use very small value because previous LM has already made the solution close to true value */
|
||||
/* storage:
|
||||
need (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N
|
||||
where N<=M/8 M<=ntiles*Nbase, Nbase<=ntiles*Nbase,
|
||||
BlocksPerGrid=ceil(ntiles*Nbase/128)
|
||||
so(max): M*( 13 + ntiles*Nbase/128 +1 +1) + 18*ntiles*Nbase
|
||||
where M: no of stations (params/8), ntiles: tile size, Nbase: baselines */
|
||||
/* storage: see function header
|
||||
*/
|
||||
ret=rtr_solve_cuda_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
|
||||
} else if (gd->status[tid]==PT_DO_WORK_RRTR) {
|
||||
float Delta0=0.01f;
|
||||
/* storage (float) 8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M */
|
||||
ret=rtr_solve_cuda_robust_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+5, gd->itermax[tid]+10, Delta0, Delta0*0.125f, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
|
||||
} else if (gd->status[tid]==PT_DO_WORK_NSD) {
|
||||
ret=nsd_solve_cuda_robust_fl(&gd->p[tid][ci*(gd->M[tid])], &gd->x[tid][8*cj*t->Nbase], gd->M[tid]/8, ntiles*t->Nbase, gd->itermax[tid]+15, gd->nulow, gd->nuhigh, gd->info[tid], gd->cbhandle[tid], gd->gWORK[tid], cj, ntiles, (void*)gd->lmdata[tid]);
|
||||
}
|
||||
init_res+=gd->info[tid][0];
|
||||
final_res+=gd->info[tid][1];
|
||||
|
@ -2654,7 +2656,6 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
|
|||
int *cr=0; /* array for random permutation of clusters */
|
||||
int c0,c1;
|
||||
|
||||
//opts[0]=LM_INIT_MU; opts[1]=1E-15; opts[2]=1E-15; opts[3]=1E-20;
|
||||
opts[0]=CLM_INIT_MU; opts[1]=1E-9; opts[2]=1E-9; opts[3]=1E-9;
|
||||
opts[4]=-CLM_DIFF_DELTA;
|
||||
|
||||
|
@ -2747,7 +2748,7 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
|
|||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
if ((robust_nuM=(double*)calloc((size_t)(M),sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
|
@ -2775,37 +2776,37 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
|
|||
int64_t data_sz=0;
|
||||
/* size for MLM (disabled) */
|
||||
//data_sz=(int64_t)(n+Mm*n+n+n+n+Mm+Mm+Mm*Mm+Mm*Mm+Mm+Mm+Mm+Mm+Mm+Nbase1*8)*sizeof(double)+(int64_t)Nbase1*2*sizeof(char);
|
||||
/* size for RTR (float) : M*( 13 + ntiles*Nbase/128 +1+1) + 18*ntiles*Nbase
|
||||
where M: no of stations (params/8) ntiles: tile size Nbase: baselines
|
||||
|
||||
/* size for RTR/NSD (float), 128 is the ThreadsPerBlock
|
||||
*/
|
||||
int64_t data_sz_rtr=0;
|
||||
if (solver_mode==SM_RTR_OSLM_LBFGS) {
|
||||
data_sz_rtr=(N*(13+Nbase1/128+1+1)+18*Nbase1)*sizeof(float);
|
||||
/* use same size as robust version, probably is lower */
|
||||
data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
|
||||
} else if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
/* 2 x Nbase more than normal RTR for weight, log(weight) */
|
||||
data_sz_rtr=(N*(13+Nbase1/128+1+1)+20*Nbase1)*sizeof(float);
|
||||
}
|
||||
if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS || solver_mode==SM_RTR_OSLM_LBFGS) {
|
||||
data_sz=(8*N*(11+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
|
||||
} else if (solver_mode==SM_NSD_RLBFGS) {
|
||||
data_sz=(8*N*(7+(Nbase1+128-1)/128)+N+8*Nbase1*2+3*Nbase1)*sizeof(float);
|
||||
} else if (solver_mode==SM_LM_LBFGS || solver_mode==SM_OSLM_LBFGS) {
|
||||
/* size for LM */
|
||||
data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
|
||||
if(solver_mode==SM_RTR_OSLM_LBFGS) {
|
||||
data_sz=MAX(data_sz,data_sz_rtr);
|
||||
if (linsolv==1) {
|
||||
data_sz+=(int64_t)Mm*sizeof(float);
|
||||
} else if (linsolv==2) {
|
||||
data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
|
||||
}
|
||||
} else if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
} else if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS) {
|
||||
/* size for ROBUSTLM */
|
||||
data_sz=(int64_t)(n+Mm*n+Mm*Mm+Mm+Mm*Mm+Mm+Mm+Mm+Mm+Nbase1*8+n+n+n+n)*sizeof(float)+(int64_t)Nbase1*2*sizeof(char);
|
||||
if(solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
data_sz=MAX(data_sz,data_sz_rtr);
|
||||
if (linsolv==1) {
|
||||
data_sz+=(int64_t)Mm*sizeof(float);
|
||||
} else if (linsolv==2) {
|
||||
data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr,"%s: %d: invalid mode for solver\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if (linsolv==1) {
|
||||
data_sz+=(int64_t)Mm*sizeof(float);
|
||||
} else if (linsolv==2) {
|
||||
data_sz+=(int64_t)(Mm*Mm+Mm*Mm+Mm)*sizeof(float);
|
||||
}
|
||||
|
||||
tpg.data_size=data_sz;
|
||||
tpg.nulow=nulow;
|
||||
tpg.nuhigh=nuhigh;
|
||||
|
@ -2938,6 +2939,11 @@ sagefit_visibilities_dual_pt_flt(double *u, double *v, double *w, double *x, int
|
|||
lmdata0.robust_nu=lmdata1.robust_nu=robust_nu0; /* initial robust nu */
|
||||
}
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_WORK_RRTR;
|
||||
} else if (solver_mode==SM_NSD_RLBFGS) {
|
||||
if (!ci) {
|
||||
lmdata0.robust_nu=lmdata1.robust_nu=robust_nu0; /* initial robust nu */
|
||||
}
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_WORK_NSD;
|
||||
} else {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: undefined solver mode\n",__FILE__,__LINE__);
|
||||
|
@ -2965,7 +2971,7 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
|
|||
nerr[c1]=0.0;
|
||||
}
|
||||
/* update robust_nu */
|
||||
if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
|
||||
if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
|
||||
robust_nuM[c0]+=lmdata0.robust_nu;
|
||||
robust_nuM[c1]+=lmdata1.robust_nu;
|
||||
}
|
||||
|
@ -3047,11 +3053,15 @@ printf("1: %lf -> %lf 2: %lf -> %lf\n\n\n",info0[0],info0[1],info1[0],info1[1]);
|
|||
} else if (solver_mode==SM_RTR_OSLM_LBFGS) {
|
||||
tpg.status[0]=PT_DO_WORK_RTR;
|
||||
} else if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
/* last iteration is OSRLM */
|
||||
if (!ci) {
|
||||
lmdata0.robust_nu=robust_nu0; /* initial robust nu */
|
||||
}
|
||||
tpg.status[0]=PT_DO_WORK_RRTR;
|
||||
} else if (solver_mode==SM_NSD_RLBFGS) {
|
||||
if (!ci) {
|
||||
lmdata0.robust_nu=robust_nu0; /* initial robust nu */
|
||||
}
|
||||
tpg.status[0]=PT_DO_WORK_NSD;
|
||||
} else {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: undefined solver mode\n",__FILE__,__LINE__);
|
||||
|
@ -3077,7 +3087,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
|
|||
nerr[c0]=0.0;
|
||||
}
|
||||
/* update robust_nu */
|
||||
if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
|
||||
if ((solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
|
||||
robust_nuM[c0]+=lmdata0.robust_nu;
|
||||
}
|
||||
/* once again subtract solved model from data */
|
||||
|
@ -3108,7 +3118,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
|
|||
free(xdummy1f);
|
||||
free(pf);
|
||||
free(ddcohf);
|
||||
if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
/* calculate mean robust_nu over all clusters */
|
||||
robust_nu0=my_dasum(M,robust_nuM)/(double)M;
|
||||
#ifdef DEBUG
|
||||
|
@ -3120,7 +3130,10 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
|
|||
free(robust_nuM);
|
||||
if (robust_nu0<nulow) {
|
||||
robust_nu0=nulow;
|
||||
} else if (robust_nu0>nuhigh) {
|
||||
robust_nu0=nuhigh;
|
||||
}
|
||||
|
||||
}
|
||||
/******** free threads ***************/
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
|
||||
|
@ -3133,11 +3146,7 @@ printf("1: %lf -> %lf\n\n\n",info0[0],info0[1]);
|
|||
|
||||
if (max_lbfgs>0) {
|
||||
/* use LBFGS */
|
||||
if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
/* if RTR, divide by 8 */
|
||||
if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
robust_nu0 *=0.125;
|
||||
}
|
||||
if (solver_mode==SM_RLM_RLBFGS || solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
lmdata0.robust_nu=robust_nu0;
|
||||
ret=lbfgs_fit_robust_cuda(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata0);
|
||||
} else {
|
||||
|
|
|
@ -646,7 +646,6 @@ predict_threadfn_withgain_full(void *data) {
|
|||
*/
|
||||
|
||||
px=(ci+t->boff)/((Ntilebase+t->carr[cm].nchunk-1)/t->carr[cm].nchunk);
|
||||
//pm=&(t->p[cm*8*N]);
|
||||
pm=&(t->p[t->carr[cm].p[px]]);
|
||||
G1[0]=(pm[sta1*8])+_Complex_I*(pm[sta1*8+1]);
|
||||
G1[1]=(pm[sta1*8+2])+_Complex_I*(pm[sta1*8+3]);
|
||||
|
@ -710,7 +709,6 @@ minimize_viz_full_pth(double *p, double *x, int m, int n, void *data) {
|
|||
int Nbase1=(dp->Nbase)*(dp->tilesz);
|
||||
|
||||
/* calculate min baselines a thread can handle */
|
||||
//Nthb0=ceil((double)Nbase1/(double)Nt);
|
||||
Nthb0=(Nbase1+Nt-1)/Nt;
|
||||
|
||||
/* setup threads */
|
||||
|
@ -800,7 +798,6 @@ minimize_viz_full_pth00(double *p, double *x, int m, int n, void *data) {
|
|||
int Ntilebase=(dp->Nbase)*(dp->tilesz);
|
||||
int px;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (ci=0; ci<Ntilebase; ci++) {
|
||||
/* iterate over the sky model and calculate contribution */
|
||||
/* for this x[8*ci:8*(ci+1)-1] */
|
||||
|
@ -873,7 +870,7 @@ minimize_viz_full_pth00(double *p, double *x, int m, int n, void *data) {
|
|||
|
||||
int
|
||||
sagefit_visibilities(double *u, double *v, double *w, double *x, int N,
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *mean_nu, double *res_0, double *res_1) {
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, int whiten, double *mean_nu, double *res_0, double *res_1) {
|
||||
/* u,v,w : size Nbase*tilesz x 1 x: size Nbase*8*tilesz x 1 */
|
||||
/* barr: size Nbase*tilesz x 1 carr: size Mx1 */
|
||||
/* pp: size 8*N*M x 1 */
|
||||
|
@ -943,7 +940,7 @@ sagefit_visibilities(double *u, double *v, double *w, double *x, int N,
|
|||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
if ((robust_nuM=(double*)calloc((size_t)(M),sizeof(double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
|
@ -1012,7 +1009,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
|
|||
/* only the last EM iteration is robust */
|
||||
if (ci==max_emiter-1){
|
||||
lmdata.robust_nu=robust_nu0;
|
||||
ret=rlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, (void*)&lmdata);
|
||||
ret=rlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, whiten, (void*)&lmdata);
|
||||
/* get updated value of robust_nu */
|
||||
robust_nuM[cj]+=lmdata.robust_nu;
|
||||
} else {
|
||||
|
@ -1022,7 +1019,7 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
|
|||
/* only the last EM iteration is robust */
|
||||
if (ci==max_emiter-1){
|
||||
lmdata.robust_nu=robust_nu0;
|
||||
ret=osrlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, randomize, (void*)&lmdata);
|
||||
ret=osrlevmar_der_single_nocuda(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, Nt, nulow, nuhigh, randomize, whiten, (void*)&lmdata);
|
||||
/* get updated value of robust_nu */
|
||||
robust_nuM[cj]+=lmdata.robust_nu;
|
||||
} else {
|
||||
|
@ -1042,6 +1039,15 @@ printf("\n\ncluster %d iter=%d\n",cj,this_itermax);
|
|||
if (ci==max_emiter-1){
|
||||
robust_nuM[cj]+=lmdata.robust_nu;
|
||||
}
|
||||
} else if (solver_mode==SM_NSD_RLBFGS) { /* Nesterov's */
|
||||
/* NSD */
|
||||
if (!ci){
|
||||
lmdata.robust_nu=robust_nu0;
|
||||
}
|
||||
ret=nsd_solve_nocuda_robust(&p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], N, ntiles*Nbase, this_itermax+15, nulow, nuhigh, info, &lmdata);
|
||||
if (ci==max_emiter-1){
|
||||
robust_nuM[cj]+=lmdata.robust_nu;
|
||||
}
|
||||
} else { /* not used */
|
||||
//ret=mlm_der_single(mylm_fit_single_pth0, mylm_jac_single_pth, &p[carr[cj].p[ck]], &xdummy[8*tcj*Nbase], 8*N, 8*ntiles*Nbase, this_itermax, NULL, info, linsolv, (void*)&lmdata);
|
||||
#ifndef USE_MIC
|
||||
|
@ -1069,7 +1075,7 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
|
|||
mylm_fit_single_pth(p, xsub, 8*N, n, (void*)&lmdata);
|
||||
my_daxpy(n, xsub, -1.0, xdummy);
|
||||
/* if robust LM, calculate average nu over hybrid clusters */
|
||||
if ((solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) && (ci==max_emiter-1)) {
|
||||
if ((solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) && (ci==max_emiter-1)) {
|
||||
robust_nuM[cj]/=(double)carr[cj].nchunk;
|
||||
}
|
||||
}
|
||||
|
@ -1088,7 +1094,7 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
|
|||
}
|
||||
free(nerr);
|
||||
free(xdummy);
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
/* calculate mean robust_nu over all clusters */
|
||||
robust_nu0=my_dasum(M,robust_nuM)/(double)M;
|
||||
#ifdef DEBUG
|
||||
|
@ -1100,6 +1106,8 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
|
|||
free(robust_nuM);
|
||||
if (robust_nu0<nulow) {
|
||||
robust_nu0=nulow;
|
||||
} else if (robust_nu0>nuhigh) {
|
||||
robust_nu0=nuhigh;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1108,13 +1116,10 @@ printf("residual init=%lf final=%lf\n\n",init_res,final_res);
|
|||
lmdata.Nt=32; /* FIXME increase threads for MIC */
|
||||
#endif
|
||||
/* use LBFGS */
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
/* if RTR, divide by 8 */
|
||||
if (solver_mode==SM_RTR_OSRLM_RLBFGS) {
|
||||
robust_nu0 *=0.125;
|
||||
}
|
||||
if (solver_mode==SM_OSLM_OSRLM_RLBFGS || solver_mode==SM_RLM_RLBFGS || solver_mode==SM_RTR_OSRLM_RLBFGS || solver_mode==SM_NSD_RLBFGS) {
|
||||
lmdata.robust_nu=robust_nu0;
|
||||
ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
|
||||
/* pre-whiten data when calculating residual */
|
||||
ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, whiten, (void*)&lmdata);
|
||||
} else {
|
||||
ret=lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
|
||||
}
|
||||
|
@ -1273,7 +1278,7 @@ bfgsfit_visibilities(double *u, double *v, double *w, double *x, int N,
|
|||
/* use LBFGS */
|
||||
if (solver_mode==2 || solver_mode==3) {
|
||||
lmdata.robust_nu=mean_nu;
|
||||
ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
|
||||
ret=lbfgs_fit_robust(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, 0, (void*)&lmdata); /* 0 to disable whitening */
|
||||
} else {
|
||||
ret=lbfgs_fit(minimize_viz_full_pth, p, x, m, n, max_lbfgs, lbfgs_m, gpu_threads, (void*)&lmdata);
|
||||
}
|
||||
|
|
|
@ -1352,7 +1352,7 @@ kernel_fns_fupdate_weights(int N, int Nbase, cuFloatComplex *x, float *y, float
|
|||
1) its not flagged (sta1,sta2)>=0
|
||||
*/
|
||||
float sumn=0.0f;
|
||||
float temp1,temp2,tt,yy,c=0.0f;
|
||||
float temp1,temp2,tt;
|
||||
if (sta1>=0 && sta2>=0) {
|
||||
cuFloatComplex G1[4];
|
||||
cuFloatComplex G2[4];
|
||||
|
@ -1382,25 +1382,28 @@ kernel_fns_fupdate_weights(int N, int Nbase, cuFloatComplex *x, float *y, float
|
|||
/* T=T*G2' */
|
||||
ambt(T1,G2,T2);
|
||||
|
||||
/* error using Kahan summation */
|
||||
/* use p=2, find MAX value of residual error out of XX,XY,YX,YY
|
||||
instead of the sum */
|
||||
/* V->U */
|
||||
temp1=y[8*n]-T2[0].x;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp1=y[8*n+1]-T2[0].y;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp2=y[8*n+1]-T2[0].y;
|
||||
sumn=temp1*temp1+temp2*temp2;
|
||||
temp1=y[8*n+2]-T2[1].x;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp1=y[8*n+3]-T2[1].y;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp2=y[8*n+3]-T2[1].y;
|
||||
tt=temp1*temp1+temp2*temp2;
|
||||
if (sumn<tt) { sumn=tt; }
|
||||
|
||||
temp1=y[8*n+4]-T2[2].x;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp1=y[8*n+5]-T2[2].y;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp2=y[8*n+5]-T2[2].y;
|
||||
tt=temp1*temp1+temp2*temp2;
|
||||
if (sumn<tt) { sumn=tt; }
|
||||
|
||||
temp1=y[8*n+6]-T2[3].x;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp1=y[8*n+7]-T2[3].y;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */
|
||||
temp2=y[8*n+7]-T2[3].y;
|
||||
tt=temp1*temp1+temp2*temp2;
|
||||
if (sumn<tt) { sumn=tt; }
|
||||
//wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */
|
||||
wtd[n]=(nu0+2.0f)/(nu0+sumn); /* 2 variate T distribution */
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1422,7 +1425,7 @@ kernel_fns_fupdate_weights_q(int N, int Nbase, cuFloatComplex *x, float *y, floa
|
|||
1) its not flagged (sta1,sta2)>=0
|
||||
*/
|
||||
float sumn=0.0f;
|
||||
float temp1,temp2,tt,yy,c=0.0f;
|
||||
float temp1,temp2,tt;
|
||||
if (sta1>=0 && sta2>=0) {
|
||||
cuFloatComplex G1[4];
|
||||
cuFloatComplex G2[4];
|
||||
|
@ -1452,25 +1455,28 @@ kernel_fns_fupdate_weights_q(int N, int Nbase, cuFloatComplex *x, float *y, floa
|
|||
/* T=T*G2' */
|
||||
ambt(T1,G2,T2);
|
||||
|
||||
/* error using Kahan summation */
|
||||
/* use p=2, find MAX value of residual error out of XX,XY,YX,YY
|
||||
instead of the sum */
|
||||
/* V->U */
|
||||
temp1=y[8*n]-T2[0].x;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp1=y[8*n+1]-T2[0].y;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp2=y[8*n+1]-T2[0].y;
|
||||
sumn=temp1*temp1+temp2*temp2;
|
||||
temp1=y[8*n+2]-T2[1].x;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp1=y[8*n+3]-T2[1].y;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp2=y[8*n+3]-T2[1].y;
|
||||
tt=temp1*temp1+temp2*temp2;
|
||||
if (sumn<tt) { sumn=tt; }
|
||||
|
||||
temp1=y[8*n+4]-T2[2].x;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp1=y[8*n+5]-T2[2].y;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp2=y[8*n+5]-T2[2].y;
|
||||
tt=temp1*temp1+temp2*temp2;
|
||||
if (sumn<tt) { sumn=tt; }
|
||||
|
||||
temp1=y[8*n+6]-T2[3].x;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
temp1=y[8*n+7]-T2[3].y;
|
||||
temp2=temp1*temp1; yy=temp2-c; tt=sumn+yy; c=(tt-sumn)-yy; sumn=tt;
|
||||
wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */
|
||||
temp2=y[8*n+7]-T2[3].y;
|
||||
tt=temp1*temp1+temp2*temp2;
|
||||
if (sumn<tt) { sumn=tt; }
|
||||
//wtd[n]=(nu0+8.0f)/(nu0+sumn); /* 8 variate T distribution */
|
||||
wtd[n]=(nu0+2.0f)/(nu0+sumn); /* 2 variate T distribution */
|
||||
qd[n]=wtd[n]-logf(wtd[n]);
|
||||
}
|
||||
}
|
||||
|
@ -1667,7 +1673,7 @@ cudakernel_fns_f(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatCo
|
|||
|
||||
return ed: error vector, BlocksPerGridx1
|
||||
*/
|
||||
/* need BlocksPerGrid+1+L float storage */
|
||||
/* need BlocksPerGrid+4+L float storage <= (2 BlocksPerGrid + 4) */
|
||||
float
|
||||
cudakernel_fns_f_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, float *y, float *coh, char *bbh, float *wtd, float *gWORK) {
|
||||
#ifdef CUDA_DBG
|
||||
|
|
|
@ -87,7 +87,7 @@ calculate_uv_mode_vectors_scalar(double u, double v, double beta, int n0, double
|
|||
shpvl[zci][xci]=H_e(xval,xci)*expval/sqrt((double)(2<<xci)*fact[xci]);
|
||||
}
|
||||
zci=1;
|
||||
xval=v*beta;
|
||||
xval=v*beta;
|
||||
expval=exp(-0.5*xval*xval);
|
||||
for (xci=0; xci<n0; xci++) {
|
||||
shpvl[zci][xci]=H_e(xval,xci)*expval/sqrt((double)(2<<xci)*fact[xci]);
|
||||
|
|
|
@ -209,10 +209,10 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
/* each element of list is a list of source names */
|
||||
if ((cfp=fopen(clusterfile,"r"))==0) {
|
||||
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
/* allocate memory for buffer */
|
||||
buff_len = 2048; /* FIXME: handle long names */
|
||||
buff_len = MAX_SNAME; /* handle long names */
|
||||
if((buff = (char*)malloc(sizeof(char)*(size_t)(buff_len)))==NULL) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
|
@ -231,7 +231,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
/* new cluster found */
|
||||
if ((clus= (clust_t*)malloc(sizeof(clust_t)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
sscanf(buff,"%d",&clus->id);
|
||||
clus->slist=NULL;
|
||||
|
@ -248,11 +248,11 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
/* source found for this cluster */
|
||||
if ((sclus= (clust_n*)malloc(sizeof(clust_n)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if ((sclus->name=(char*)malloc((size_t)(strlen(buff)+1)*sizeof(char)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
strcpy(sclus->name,buff);
|
||||
clus->slist=g_list_prepend(clus->slist,sclus);
|
||||
|
@ -277,10 +277,10 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
*/
|
||||
if ((cfp=fopen(skymodel,"r"))==0) {
|
||||
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((buff = (char*)realloc((void*)(buff),sizeof(char)*(size_t)(128)))==NULL) {
|
||||
if ((buff = (char*)realloc((void*)(buff),sizeof(char)*(size_t)(MAX_SNAME)))==NULL) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
@ -298,12 +298,12 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
if (c!=EOF && c>0) {
|
||||
if ((hkey=(char*)malloc((size_t)(strlen(buff)+1)*sizeof(char)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
strcpy(hkey,buff);
|
||||
if ((source=(sinfo_t*)malloc(sizeof(sinfo_t)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
/* calculate l,m */
|
||||
/* Rad=(hr+min/60+sec/60*60)*pi/12 */
|
||||
|
@ -328,11 +328,11 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
fratio=log(freq0/f0);
|
||||
fratio1=fratio*fratio;
|
||||
fratio2=fratio1*fratio;
|
||||
/* catch -ve sI */
|
||||
/* catch -ve and 0 sI */
|
||||
if (sI>0.0) {
|
||||
source->sI=exp(log(sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2);
|
||||
} else {
|
||||
source->sI=-exp(log(-sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2);
|
||||
source->sI=(sI==0.0?0.0:-exp(log(-sI)+spec_idx*fratio+spec_idx1*fratio1+spec_idx2*fratio2));
|
||||
}
|
||||
} else {
|
||||
source->sI=sI;
|
||||
|
@ -365,7 +365,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
source->stype=STYPE_GAUSSIAN;
|
||||
if((exg=(exinfo_gaussian *)malloc(sizeof(exinfo_gaussian)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
exg->eX=2.0*eX; /* scale by 2 */
|
||||
exg->eY=2.0*eY;
|
||||
|
@ -387,7 +387,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
source->stype=STYPE_DISK;
|
||||
if((exd=(exinfo_disk*)malloc(sizeof(exinfo_disk)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
exd->eX=eX;
|
||||
/* negate angles */
|
||||
|
@ -407,7 +407,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
source->stype=STYPE_RING;
|
||||
if((exr=(exinfo_ring*)malloc(sizeof(exinfo_ring)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
exr->eX=eX;
|
||||
/* negate angles */
|
||||
|
@ -427,7 +427,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
source->stype=STYPE_SHAPELET;
|
||||
if((exs=(exinfo_shapelet*)malloc(sizeof(exinfo_shapelet)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
exs->eX=eX;
|
||||
exs->eY=eY;
|
||||
|
@ -474,7 +474,7 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
/* setup the array of cluster/source information */
|
||||
if ((*carr=(clus_source_t*)malloc((size_t)(g_list_length(clusters))*sizeof(clus_source_t)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ci=0;
|
||||
|
@ -491,48 +491,48 @@ read_sky_cluster(const char *skymodel, const char *clusterfile, clus_source_t **
|
|||
|
||||
if (((*carr)[ci].ll=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].mm=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].nn=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].sI=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].stype=(unsigned char*)malloc((size_t)((*carr)[ci].N)*sizeof(unsigned char)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].ex=(void**)malloc((size_t)((*carr)[ci].N)*sizeof(void*)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
/* for handling multi channel data */
|
||||
if (((*carr)[ci].sI0=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].f0=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].spec_idx=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].spec_idx1=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
if (((*carr)[ci].spec_idx2=(double*)malloc((size_t)((*carr)[ci].N)*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
return 1;
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
|
@ -689,3 +689,55 @@ update_ignorelist(const char *ignfile, int *ignlist, int M, clus_source_t *carr)
|
|||
printf("Total %d clustes ignored in simulation.\n",cn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int
|
||||
read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double *arhoslave) {
|
||||
|
||||
FILE *cfp;
|
||||
int c,ci,cj,cluster_id,hybrid,hb;
|
||||
double admm_rho;
|
||||
if ((cfp=fopen(admm_rho_file,"r"))==0) {
|
||||
fprintf(stderr,"%s: %d: no file\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
c=skip_lines(cfp);
|
||||
ci=0; /* store it in reverse order */
|
||||
cj=0;
|
||||
while(c>=0) {
|
||||
c=fscanf(cfp,"%d %d %lf",&cluster_id,&hybrid,&admm_rho);
|
||||
/* add this value to arho array */
|
||||
if (c!=EOF && c>0) {
|
||||
/* found a valid line */
|
||||
arhoslave[M-1-cj]=admm_rho; /* reverse order */
|
||||
for (hb=0; hb<hybrid; hb++) {
|
||||
if (hb==0) {
|
||||
arhoslave[M-1-cj]=admm_rho; /* reverse order */
|
||||
//printf("clus=%d arr=%d rhoslave=%lf\n",cluster_id,M-1-cj,admm_rho);
|
||||
cj++;
|
||||
}
|
||||
arho[Mt-1-ci]=admm_rho; /* reverse order */
|
||||
//printf("clus=%d arr=%d rho=%lf\n",cluster_id,Mt-1-ci,admm_rho);
|
||||
if (ci<Mt-1) {
|
||||
ci++;
|
||||
} else {
|
||||
/* array size does not match one given by text file */
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
c=skip_restof_line(cfp);
|
||||
c=skip_lines(cfp);
|
||||
}
|
||||
/* report any errors */
|
||||
if (!(c==EOF && ci==Mt-1)) {
|
||||
fprintf(stderr,"%s: %d: Error: cluster numbers in cluster file and regularization file do not match up.\n",__FILE__,__LINE__);
|
||||
}
|
||||
fclose(cfp);
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -573,11 +573,11 @@ residual_threadfn_onefreq(void *data) {
|
|||
fratio=log(freq0/t->carr[cm].f0[cn]);
|
||||
fratio1=fratio*fratio;
|
||||
fratio2=fratio1*fratio;
|
||||
/* catch -ve sI */
|
||||
/* catch -ve and 0 sI */
|
||||
if (t->carr[cm].sI0[cn]>0.0) {
|
||||
prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
|
||||
} else {
|
||||
prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
|
||||
prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
|
||||
}
|
||||
} else {
|
||||
prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
|
||||
|
@ -846,11 +846,11 @@ residual_threadfn_multifreq(void *data) {
|
|||
fratio=log(freq0/t->carr[cm].f0[cn]);
|
||||
fratio1=fratio*fratio;
|
||||
fratio2=fratio1*fratio;
|
||||
/* catch -ve sI */
|
||||
/* catch -ve and 0 sI */
|
||||
if (t->carr[cm].sI0[cn]>0.0) {
|
||||
prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
|
||||
} else {
|
||||
prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
|
||||
prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
|
||||
}
|
||||
} else {
|
||||
prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
|
||||
|
@ -1087,11 +1087,11 @@ visibilities_threadfn_multifreq(void *data) {
|
|||
fratio=log(freq0/t->carr[cm].f0[cn]);
|
||||
fratio1=fratio*fratio;
|
||||
fratio2=fratio1*fratio;
|
||||
/* catch -ve sI */
|
||||
/* catch -ve and 0 sI */
|
||||
if (t->carr[cm].sI0[cn]>0.0) {
|
||||
prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
|
||||
} else {
|
||||
prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
|
||||
prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
|
||||
}
|
||||
} else {
|
||||
prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
|
||||
|
@ -1151,7 +1151,7 @@ visibilities_threadfn_multifreq(void *data) {
|
|||
|
||||
/* FIXME: tail timeslots still not written properly (probably due to flagging while reading data) */
|
||||
int
|
||||
predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,int Nt) {
|
||||
predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,int Nt, int add_to_data) {
|
||||
int nth,nth1,ci;
|
||||
|
||||
int Nthb0,Nthb;
|
||||
|
@ -1177,8 +1177,10 @@ predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int
|
|||
exit(1);
|
||||
}
|
||||
|
||||
/* set output column to zero */
|
||||
memset(x,0,sizeof(double)*8*Nbase*tilesz*Nchan);
|
||||
if (!add_to_data) {
|
||||
/* set output column to zero */
|
||||
memset(x,0,sizeof(double)*8*Nbase*tilesz*Nchan);
|
||||
}
|
||||
|
||||
/* iterate over threads, allocating baselines per thread */
|
||||
ci=0;
|
||||
|
@ -1211,6 +1213,8 @@ predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int
|
|||
threaddata[nth].fdelta=fdelta/(double)Nchan;
|
||||
threaddata[nth].tdelta=tdelta;
|
||||
threaddata[nth].dec0=dec0;
|
||||
threaddata[nth].add_to_data=add_to_data;
|
||||
|
||||
|
||||
pthread_create(&th_array[nth],&attr,visibilities_threadfn_multifreq,(void*)(&threaddata[nth]));
|
||||
/* next baseline set */
|
||||
|
@ -1252,8 +1256,10 @@ predictwithgain_threadfn_multifreq(void *data) {
|
|||
/* iterate over the sky model and calculate contribution */
|
||||
/* for this x[8*ci:8*(ci+1)-1] */
|
||||
/* if this baseline is flagged, we do not compute */
|
||||
for (cf=0; cf<t->Nchan; cf++) {
|
||||
memset(&t->x[8*ci+cf*Ntilebase*8],0,sizeof(double)*8);
|
||||
if (!t->add_to_data) { /* only model is written as output */
|
||||
for (cf=0; cf<t->Nchan; cf++) {
|
||||
memset(&t->x[8*ci+cf*Ntilebase*8],0,sizeof(double)*8);
|
||||
}
|
||||
}
|
||||
|
||||
/* stations for this baseline */
|
||||
|
@ -1303,11 +1309,11 @@ predictwithgain_threadfn_multifreq(void *data) {
|
|||
fratio=log(freq0/t->carr[cm].f0[cn]);
|
||||
fratio1=fratio*fratio;
|
||||
fratio2=fratio1*fratio;
|
||||
/* catch -ve sI */
|
||||
/* catch -ve and 0 sI */
|
||||
if (t->carr[cm].sI0[cn]>0.0) {
|
||||
prodterm=exp(log(t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
|
||||
} else {
|
||||
prodterm=-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph);
|
||||
prodterm=(t->carr[cm].sI0[cn]==0.0?0.0:-exp(log(-t->carr[cm].sI0[cn])+t->carr[cm].spec_idx[cn]*fratio+t->carr[cm].spec_idx1[cn]*fratio1+t->carr[cm].spec_idx2[cn]*fratio2)*(cosph+_Complex_I*sinph));
|
||||
}
|
||||
} else {
|
||||
prodterm=t->carr[cm].sI[cn]*(cosph+_Complex_I*sinph);
|
||||
|
@ -1364,12 +1370,44 @@ predictwithgain_threadfn_multifreq(void *data) {
|
|||
}
|
||||
}
|
||||
}
|
||||
/* if valid cluster is given, correct with its solutions */
|
||||
if (t->pinv) {
|
||||
cm=t->ccid;
|
||||
px=(ci+t->boff)/((Ntilebase+t->carr[cm].nchunk-1)/t->carr[cm].nchunk);
|
||||
pm=&(t->pinv[8*t->N*px]);
|
||||
G1[0]=(pm[sta1*8])+_Complex_I*(pm[sta1*8+1]);
|
||||
G1[1]=(pm[sta1*8+2])+_Complex_I*(pm[sta1*8+3]);
|
||||
G1[2]=(pm[sta1*8+4])+_Complex_I*(pm[sta1*8+5]);
|
||||
G1[3]=(pm[sta1*8+6])+_Complex_I*(pm[sta1*8+7]);
|
||||
G2[0]=(pm[sta2*8])+_Complex_I*(pm[sta2*8+1]);
|
||||
G2[1]=(pm[sta2*8+2])+_Complex_I*(pm[sta2*8+3]);
|
||||
G2[2]=(pm[sta2*8+4])+_Complex_I*(pm[sta2*8+5]);
|
||||
G2[3]=(pm[sta2*8+6])+_Complex_I*(pm[sta2*8+7]);
|
||||
|
||||
/* now do correction, if any */
|
||||
C[0]=t->x[8*ci]+_Complex_I*t->x[8*ci+1];
|
||||
C[1]=t->x[8*ci+2]+_Complex_I*t->x[8*ci+3];
|
||||
C[2]=t->x[8*ci+4]+_Complex_I*t->x[8*ci+5];
|
||||
C[3]=t->x[8*ci+6]+_Complex_I*t->x[8*ci+7];
|
||||
/* T1=G1*C */
|
||||
amb(G1,C,T1);
|
||||
/* T2=T1*G2' */
|
||||
ambt(T1,G2,T2);
|
||||
t->x[8*ci]=creal(T2[0]);
|
||||
t->x[8*ci+1]=cimag(T2[0]);
|
||||
t->x[8*ci+2]=creal(T2[1]);
|
||||
t->x[8*ci+3]=cimag(T2[1]);
|
||||
t->x[8*ci+4]=creal(T2[2]);
|
||||
t->x[8*ci+5]=cimag(T2[2]);
|
||||
t->x[8*ci+6]=creal(T2[3]);
|
||||
t->x[8*ci+7]=cimag(T2[3]);
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int
|
||||
predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignlist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt) {
|
||||
predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignlist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt, int add_to_data, int ccid, double rho) {
|
||||
int nth,nth1,ci;
|
||||
|
||||
int Nthb0,Nthb;
|
||||
|
@ -1379,6 +1417,32 @@ predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,d
|
|||
|
||||
int Nbase1=Nbase*tilesz;
|
||||
|
||||
|
||||
int cm,cj;
|
||||
double *pm,*pinv=0;
|
||||
cm=-1;
|
||||
/* find if any cluster is specified for correction of data */
|
||||
for (cj=0; cj<M; cj++) { /* clusters */
|
||||
/* check if cluster id == ccid to do a correction */
|
||||
if (carr[cj].id==ccid) {
|
||||
cm=cj;
|
||||
ci=1; /* correction cluster found */
|
||||
}
|
||||
}
|
||||
if (cm>=0) { /* valid cluser for correction */
|
||||
/* allocate memory for inverse J */
|
||||
if ((pinv=(double*)malloc((size_t)8*N*carr[cm].nchunk*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
for (cj=0; cj<carr[cm].nchunk; cj++) {
|
||||
pm=&(p[carr[cm].p[cj]]); /* start of solutions */
|
||||
/* invert N solutions */
|
||||
for (ci=0; ci<N; ci++) {
|
||||
mat_invert(&pm[8*ci],&pinv[8*ci+8*N*cj], rho);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* calculate min baselines a thread can handle */
|
||||
Nthb0=(Nbase1+Nt-1)/Nt;
|
||||
|
@ -1427,6 +1491,11 @@ predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,d
|
|||
threaddata[nth].fdelta=fdelta/(double)Nchan;
|
||||
threaddata[nth].tdelta=tdelta;
|
||||
threaddata[nth].dec0=dec0;
|
||||
threaddata[nth].add_to_data=add_to_data;
|
||||
/* for correction of data */
|
||||
threaddata[nth].pinv=pinv;
|
||||
threaddata[nth].ccid=cm;
|
||||
|
||||
|
||||
pthread_create(&th_array[nth],&attr,predictwithgain_threadfn_multifreq,(void*)(&threaddata[nth]));
|
||||
/* next baseline set */
|
||||
|
|
|
@ -321,10 +321,11 @@ __global__ void
|
|||
kernel_evaluatenu_fl_eight(int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
|
||||
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
/* each block calculte psi((nu+8)/2)-log((nu+8)/2) */
|
||||
/* actually p=2, so psi((nu+2)/2)-log((nu+2)/2) */
|
||||
float dgm0;
|
||||
if (threadIdx.x==0) {
|
||||
dgm0=digamma_fl(nu0*0.5f+4.0f);
|
||||
dgm0=dgm0-logf((nu0+8.0f)*0.5f); /* psi((nu0+8)/2)-log((nu0+8)/2) */
|
||||
dgm0=digamma_fl(nu0*0.5f+1.0f);
|
||||
dgm0=dgm0-logf((nu0+2.0f)*0.5f); /* psi((nu0+8)/2)-log((nu0+8)/2) */
|
||||
}
|
||||
__syncthreads();
|
||||
if (tid<Nd) {
|
||||
|
@ -453,7 +454,7 @@ cudakernel_evaluatenu_fl(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float q
|
|||
|
||||
|
||||
/* evaluate expression for finding optimum nu for
|
||||
a range of nu values, using AECM
|
||||
a range of nu values, using AECM (p=8 before, but now p=2)
|
||||
nu0: current value of robust_nu*/
|
||||
void
|
||||
cudakernel_evaluatenu_fl_eight(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
|
||||
|
|
|
@ -911,7 +911,9 @@ func_grad_robust(
|
|||
int
|
||||
lbfgs_fit_robust(
|
||||
void (*func)(double *p, double *hx, int m, int n, void *adata),
|
||||
double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {
|
||||
double *p, double *x, int m, int n, int itmax, int M, int gpu_threads,
|
||||
int whiten, /* if >0 whiten data 1: NCP, 2... */
|
||||
void *adata) {
|
||||
|
||||
double *gk; /* gradients at both k+1 and k iter */
|
||||
double *xk1,*xk; /* parameters at k+1 and k iter */
|
||||
|
|
|
@ -1204,6 +1204,7 @@ osrlevmar_der_single_cuda_fl(
|
|||
int ntiles, /* total tile (data) size being solved for */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
int randomize, /* if >0 randomize */
|
||||
int whiten, /* if >0 whiten data 1: NCP, 2... */
|
||||
void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
|
||||
* Set to NULL if not needed
|
||||
*/
|
||||
|
@ -1883,6 +1884,7 @@ rlevmar_der_single_nocuda(
|
|||
int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
|
||||
int Nt, /* no of threads */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
int whiten, /* if >0 whiten data 1: NCP, 2... */
|
||||
void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
|
||||
* Set to NULL if not needed
|
||||
*/
|
||||
|
@ -2019,17 +2021,19 @@ rlevmar_der_single_nocuda(
|
|||
exit(1);
|
||||
}
|
||||
WORK=Ud=Sd=VTd=0;
|
||||
// for (ci=0;ci<M; ci++) {
|
||||
// aones[ci]=1.0;
|
||||
// }
|
||||
me_data_t *dt=(me_data_t*)adata;
|
||||
setweights(M,aones,1.0,dt->Nt);
|
||||
int nw,wt_itmax=3;
|
||||
me_data_t *lmdata=(me_data_t*)adata;
|
||||
double wt_sum,lambda,robust_nu=lmdata->robust_nu;
|
||||
double robust_nu1;
|
||||
|
||||
setweights(M,aones,1.0,lmdata->Nt);
|
||||
/*W set initial weights to 1 */
|
||||
// for (ci=0;ci<N; ci++) {
|
||||
// wtd[ci]=1.0;
|
||||
// }
|
||||
setweights(N,wtd,1.0,dt->Nt);
|
||||
setweights(N,wtd,1.0,lmdata->Nt);
|
||||
/* modify weights with whitening weights */
|
||||
if (whiten) {
|
||||
/* use correct offset for u,v based on tile offset */
|
||||
add_whitening_weights(N/8, wtd, &lmdata->u[lmdata->tileoff*lmdata->Nbase], &lmdata->v[lmdata->tileoff*lmdata->Nbase], *(lmdata->freq0), lmdata->Nt);
|
||||
}
|
||||
|
||||
/* memory allocation: different solvers */
|
||||
if (solve_axb==0) {
|
||||
|
@ -2078,10 +2082,6 @@ rlevmar_der_single_nocuda(
|
|||
}
|
||||
}
|
||||
|
||||
int nw,wt_itmax=3;
|
||||
me_data_t *lmdata=(me_data_t*)adata;
|
||||
double wt_sum,lambda,robust_nu=lmdata->robust_nu;
|
||||
double robust_nu1;
|
||||
|
||||
/* EM iteration loop */
|
||||
/************************************************************/
|
||||
|
@ -2403,6 +2403,10 @@ printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
|
|||
printf("nu updated from %lf in [%lf,%lf] to %lf\n",robust_nu,robust_nulow, robust_nuhigh,robust_nu1);
|
||||
#endif
|
||||
robust_nu=robust_nu1;
|
||||
if (whiten) {
|
||||
/* use correct offset for u,v based on tile offset */
|
||||
add_whitening_weights(N/8, wtd, &lmdata->u[lmdata->tileoff*lmdata->Nbase], &lmdata->v[lmdata->tileoff*lmdata->Nbase], *(lmdata->freq0), lmdata->Nt);
|
||||
}
|
||||
|
||||
/* normalize weights */
|
||||
wt_sum=lambda/(double)N;
|
||||
|
@ -2491,6 +2495,7 @@ osrlevmar_der_single_nocuda(
|
|||
int Nt, /* no of threads */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
int randomize, /* if >0 randomize */
|
||||
int whiten, /* if >0 whiten data 1: NCP, 2... */
|
||||
void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
|
||||
* Set to NULL if not needed
|
||||
*/
|
||||
|
@ -2611,16 +2616,20 @@ osrlevmar_der_single_nocuda(
|
|||
exit(1);
|
||||
}
|
||||
WORK=Ud=Sd=VTd=0;
|
||||
// for (ci=0;ci<M; ci++) {
|
||||
// aones[ci]=1.0;
|
||||
// }
|
||||
me_data_t *dt=(me_data_t*)adata;
|
||||
setweights(M,aones,1.0,dt->Nt);
|
||||
me_data_t *lmdata0=(me_data_t*)adata;
|
||||
int nw,wt_itmax=3;
|
||||
double wt_sum,lambda,robust_nu=lmdata0->robust_nu;
|
||||
double robust_nu1;
|
||||
|
||||
|
||||
setweights(M,aones,1.0,lmdata0->Nt);
|
||||
/*W set initial weights to 1 */
|
||||
// for (ci=0;ci<N; ci++) {
|
||||
// wtd[ci]=1.0;
|
||||
// }
|
||||
setweights(N,wtd,1.0,dt->Nt);
|
||||
setweights(N,wtd,1.0,lmdata0->Nt);
|
||||
/* modify weights with whitening weights */
|
||||
if (whiten) {
|
||||
add_whitening_weights(N/8, wtd, &lmdata0->u[lmdata0->tileoff*lmdata0->Nbase], &lmdata0->v[lmdata0->tileoff*lmdata0->Nbase], *(lmdata0->freq0), lmdata0->Nt);
|
||||
}
|
||||
|
||||
/* memory allocation: different solvers */
|
||||
if (solve_axb==0) {
|
||||
|
||||
|
@ -2668,10 +2677,7 @@ osrlevmar_der_single_nocuda(
|
|||
}
|
||||
}
|
||||
|
||||
int nw,wt_itmax=3;
|
||||
me_data_t *lmdata0=(me_data_t*)adata;
|
||||
double wt_sum,lambda,robust_nu=lmdata0->robust_nu;
|
||||
double robust_nu1;
|
||||
|
||||
/* setup OS subsets and stating offsets */
|
||||
/* ME data for Jacobian calculation (need a new one) */
|
||||
me_data_t lmdata;
|
||||
|
@ -2737,7 +2743,6 @@ osrlevmar_der_single_nocuda(
|
|||
l=l+Ntpersubset;
|
||||
}
|
||||
|
||||
|
||||
/* EM iteration loop */
|
||||
/************************************************************/
|
||||
for (nw=0; nw<wt_itmax; nw++) {
|
||||
|
@ -3051,6 +3056,9 @@ printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
|
|||
printf("nu updated from %lf in [%lf,%lf] to %lf\n",robust_nu,robust_nulow, robust_nuhigh,robust_nu1);
|
||||
#endif
|
||||
robust_nu=robust_nu1;
|
||||
if (whiten) {
|
||||
add_whitening_weights(N/8, wtd, &lmdata0->u[lmdata0->tileoff*lmdata0->Nbase], &lmdata0->v[lmdata0->tileoff*lmdata0->Nbase], *(lmdata0->freq0), lmdata0->Nt);
|
||||
}
|
||||
|
||||
/* normalize weights */
|
||||
wt_sum=lambda/(double)N;
|
||||
|
|
|
@ -312,6 +312,7 @@ fns_f(complex double *x, double *y, global_data_rtr_t *gdata) {
|
|||
|
||||
|
||||
/* worker thread function for weight update (nu+8)/(nu+error^2) */
|
||||
/* update: error: min of XX,XY,YX,YY errors, so p=2 */
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
|
@ -363,7 +364,8 @@ threadfn_fns_fupdate_weights(void *data) {
|
|||
double r11=t->y[8*ci+6]-creal(T2[3]);
|
||||
double i11=t->y[8*ci+7]-cimag(T2[3]);
|
||||
|
||||
t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
|
||||
//t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
|
||||
t->wtd[ci] = (t->nu0+2.0)/(t->nu0+MAX(r00*r00+i00*i00,MAX(r01*r01+i01*i01,MAX(r10*r10+i10*i10,r11*r11+i11*i11))));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -475,12 +477,15 @@ fns_fupdate_weights(complex double *x, double *y, global_data_rtr_t *gdata) {
|
|||
pthread_join(gdata->th_array[nth1],NULL);
|
||||
}
|
||||
sumlogw/=(double)Nbase1;
|
||||
free(threaddata);
|
||||
free(threaddata);
|
||||
|
||||
/* find new value for nu, p-variate T dist, p=8 */
|
||||
/* find new value for nu, p-variate T dist, p=8 (update p=2 because using MAX() for residual calculation, not sum) */
|
||||
/* psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0, AECM */
|
||||
double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 8, dp->robust_nu);
|
||||
double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 2, dp->robust_nu);
|
||||
|
||||
/* make sure new value stays within bounds */
|
||||
if (nu1<gdata->nulow) { return gdata->nulow; }
|
||||
if (nu1>gdata->nuhigh) { return gdata->nuhigh; }
|
||||
return nu1;
|
||||
}
|
||||
|
||||
|
@ -1400,6 +1405,7 @@ armijostep(int N,complex double *x,complex double *teta, double *y, global_data_
|
|||
return nocostred;
|
||||
}
|
||||
|
||||
|
||||
/* Fine tune initial trust region radius, also update initial value for x
|
||||
A. Sartenaer, 1995
|
||||
returns : trust region estimate,
|
||||
|
@ -1427,10 +1433,10 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
|
|||
fns_fgrad(x,eta,y,gdata,1);
|
||||
//normalize
|
||||
double eta_nrm=my_cnrm2(4*N,eta);
|
||||
my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta);
|
||||
my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta);
|
||||
|
||||
my_ccopy(4*N,eta,1,s,1);
|
||||
my_cscal(4*N, delta_0+0.0*_Complex_I, s);
|
||||
my_cscal(4*N, delta_0+0.0*_Complex_I, s);
|
||||
//Hessian at s
|
||||
fns_fhess(x,s,Heta,y,gdata);
|
||||
|
||||
|
@ -1439,7 +1445,7 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
|
|||
double mu_0=0.5; double mu_1=0.5; double mu_2=0.35;
|
||||
double teta=0.25;
|
||||
|
||||
|
||||
|
||||
int m,MK=4;
|
||||
for (m=0; m<MK; m++) {
|
||||
/* x_prop=x0-s */
|
||||
|
@ -1449,7 +1455,7 @@ itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *
|
|||
/* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
|
||||
mk=f0-fns_g(N,x_prop,eta,s)-0.5*fns_g(N,x_prop,Heta,s);
|
||||
fk=fns_f(x_prop,y,gdata);
|
||||
|
||||
|
||||
if (f0==mk) {
|
||||
rho=1e9;
|
||||
} else {
|
||||
|
@ -1521,7 +1527,7 @@ printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,
|
|||
#endif
|
||||
|
||||
my_ccopy(4*N,eta,1,s,1);
|
||||
my_cscal(4*N,delta_0+0.0*_Complex_I, s);
|
||||
my_cscal(4*N,delta_0+0.0*_Complex_I, s);
|
||||
}
|
||||
|
||||
|
||||
|
@ -1539,6 +1545,8 @@ printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,
|
|||
return Delta0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int
|
||||
rtr_solve_nocuda_robust(
|
||||
double *x0, /* initial values and updated solution at output (size 8*N double) */
|
||||
|
@ -1694,6 +1702,7 @@ rtr_solve_nocuda_robust(
|
|||
int rsdstat=0;
|
||||
/***************************************************/
|
||||
/* RSD solution */
|
||||
//for (ci=0; ci<itmax_rsd; ci++) {
|
||||
for (ci=0; ci<0; ci++) {
|
||||
/* Armijo step */
|
||||
/* teta=armijostep(V,C,N,x); */
|
||||
|
@ -1707,11 +1716,13 @@ rtr_solve_nocuda_robust(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
double Delta_new=itrr(N,x,eta,Heta, y, &gdata, fgradx, x_prop);
|
||||
#ifdef DEBUG
|
||||
printf("TR radius given=%lf est=%lf\n",Delta0,Delta_new);
|
||||
#endif
|
||||
|
||||
//old values
|
||||
//Delta_bar=MIN(fx,0.01);
|
||||
//Delta0=Delta_bar*0.125;
|
||||
Delta0=MIN(Delta_new,0.01); /* need to be more restrictive for EM */
|
||||
|
@ -1971,3 +1982,265 @@ Delta = Delta0;
|
|||
free(x);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
nsd_solve_nocuda_robust(
|
||||
double *x0, /* initial values and updated solution at output (size 8*N double) */
|
||||
double *y, /* data vector (size 8*M double) */
|
||||
int N, /* no. of stations */
|
||||
int M, /* no. of constraints */
|
||||
int itermax, /* maximum number of iterations RSD */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
double *info, /* initial and final residuals */
|
||||
me_data_t *adata) { /* pointer to additional data */
|
||||
|
||||
/* reshape x to make J: 2Nx2 complex double
|
||||
*/
|
||||
complex double *x;
|
||||
if ((x=(complex double*)malloc((size_t)4*N*sizeof(complex double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
/* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
|
||||
to
|
||||
J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
|
||||
*/
|
||||
double *Jd=(double*)x;
|
||||
/* re J(0,0) */
|
||||
my_dcopy(N, &x0[0], 8, &Jd[0], 4);
|
||||
/* im J(0,0) */
|
||||
my_dcopy(N, &x0[1], 8, &Jd[1], 4);
|
||||
/* re J(1,0) */
|
||||
my_dcopy(N, &x0[4], 8, &Jd[2], 4);
|
||||
/* im J(1,0) */
|
||||
my_dcopy(N, &x0[5], 8, &Jd[3], 4);
|
||||
/* re J(0,1) */
|
||||
my_dcopy(N, &x0[2], 8, &Jd[4*N], 4);
|
||||
/* im J(0,1) */
|
||||
my_dcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
|
||||
/* re J(1,1) */
|
||||
my_dcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
|
||||
/* im J(1,1) */
|
||||
my_dcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
|
||||
|
||||
|
||||
|
||||
int Nt=adata->Nt;
|
||||
int ci;
|
||||
global_data_rtr_t gdata;
|
||||
|
||||
gdata.medata=adata;
|
||||
/* setup threads */
|
||||
pthread_attr_init(&gdata.attr);
|
||||
pthread_attr_setdetachstate(&gdata.attr,PTHREAD_CREATE_JOINABLE);
|
||||
|
||||
if ((gdata.th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((gdata.mx_array=(pthread_mutex_t*)malloc((size_t)N*sizeof(pthread_mutex_t)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if ((gdata.iw=(double*)malloc((size_t)N*sizeof(double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
/* weights for robust LS, length could be less than total no of baselines
|
||||
therefore use relative offset boff */
|
||||
if ((gdata.wtd=(double*)malloc((size_t)M*sizeof(double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
printf("%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
for (ci=0; ci<N; ci++) {
|
||||
pthread_mutex_init(&gdata.mx_array[ci],NULL);
|
||||
}
|
||||
/* count baseline->station contributions
|
||||
NOTE: has to be done here because the baseline offset would change */
|
||||
fns_fcount(&gdata);
|
||||
/***************************************************/
|
||||
complex double *fgradx,*eta,*z,*x_prop,*z_prop;
|
||||
if ((fgradx=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if ((eta=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if ((z=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if ((x_prop=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if ((z_prop=(complex double*)calloc((size_t)4*N,sizeof(complex double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*set initial weights to 1 */
|
||||
setweights(M,gdata.wtd,1.0,Nt);
|
||||
gdata.nulow=robust_nulow;
|
||||
gdata.nuhigh=robust_nuhigh;
|
||||
|
||||
double fx;
|
||||
fx=fns_f(x,y,&gdata);
|
||||
double fx0=fx;
|
||||
/***************************************************/
|
||||
/* gradient at x0 */
|
||||
fns_fgrad(x,fgradx,y,&gdata,1);
|
||||
/* Hessian at x0,x0 */
|
||||
fns_fhess(x,x,z,y,&gdata);
|
||||
/* intial step ~ 1/||Hessian|| */
|
||||
double hess_nrm=my_cnrm2(4*N,z);
|
||||
double t=1.0/hess_nrm;
|
||||
/* if initial step too small */
|
||||
if (t<1e-6) {
|
||||
t=1e-6;
|
||||
}
|
||||
|
||||
/* z <= x */
|
||||
my_ccopy(4*N,x,1,z,1);
|
||||
double theta=1.0;
|
||||
double ALPHA=1.01; /* step-size growth factor */
|
||||
double BETA=0.5; /* step-size shrinkage factor */
|
||||
|
||||
int k;
|
||||
for (k=0; k<itermax; k++) {
|
||||
/* x_prop <= x */
|
||||
my_ccopy(4*N,x,1,x_prop,1);
|
||||
/* z_prop <= z */
|
||||
my_ccopy(4*N,z,1,z_prop,1);
|
||||
|
||||
/* x <= z - t * grad */
|
||||
my_ccopy(4*N,z,1,x,1);
|
||||
my_caxpy(4*N, fgradx, -t+0.0*_Complex_I, x);
|
||||
|
||||
/* if ||x-z|| == t||grad|| is below threshold, stop iteration */
|
||||
double grad_nrm=my_cnrm2(4*N,fgradx);
|
||||
double x_nrm=my_cnrm2(4*N,x);
|
||||
/* norm(y-x)/max(1,norm(x)); */
|
||||
if (grad_nrm*t/MAX(1.0,x_nrm) < 1e-6) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* theta = 2/(1 + sqrt(1+4/(theta^2))); */
|
||||
theta=2.0/(1.0 + sqrt(1.0+4.0/(theta*theta)));
|
||||
|
||||
/* z = x + (1-theta)*(x-x_prop);
|
||||
z = (2-theta)*x - (1-theta) * x_prop */
|
||||
my_ccopy(4*N,x,1,z,1);
|
||||
my_cscal(4*N, 2.0-theta+0.0*_Complex_I, z);
|
||||
my_caxpy(4*N, x_prop, -(1.0-theta)+0.0*_Complex_I, z);
|
||||
|
||||
/* eta = grad_old;
|
||||
grad <= grad_f( z ) */
|
||||
my_ccopy(4*N,fgradx,1,eta,1);
|
||||
fns_fgrad(z,fgradx,y,&gdata,1);
|
||||
|
||||
/* z_prop <= z_prop - z */
|
||||
my_caxpy(4*N, z, -1.0+0.0*_Complex_I, z_prop);
|
||||
/* eta <= eta - new_grad */
|
||||
my_caxpy(4*N, fgradx, -1.0+0.0*_Complex_I, eta);
|
||||
|
||||
/* ||z-z_prop|| */
|
||||
double ydiffnrm=my_cnrm2(4*N,z_prop);
|
||||
/* (z-z_prop)'*(grad-grad_old) */
|
||||
double dot_ydiff_gdiff=my_ddot(8*N, (double *)z_prop, (double *)eta);
|
||||
#ifdef DEBUG
|
||||
printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
|
||||
#endif
|
||||
/* the above can be NAN, if so break loop */
|
||||
if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* backtracking
|
||||
t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
|
||||
t = min( ALPHA*t, max( BETA*t, t_hat ));
|
||||
*/
|
||||
double t_hat=0.5*(ydiffnrm*ydiffnrm)/fabs(dot_ydiff_gdiff);
|
||||
t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("k=%d theta=%e step=%e\n",k,theta,t);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
fx=fns_f(x,y,&gdata);
|
||||
/* final residual */
|
||||
info[1]=fx;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("k=%d cost initial=%e final=%e\n",k,fx0,fx);
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
free(fgradx);
|
||||
free(eta);
|
||||
free(z);
|
||||
free(z_prop);
|
||||
free(x_prop);
|
||||
/***************************************************/
|
||||
double robust_nu1=fns_fupdate_weights(x,y,&gdata);
|
||||
adata->robust_nu=robust_nu1;
|
||||
if (fx0>fx) {
|
||||
/* copy back solution to x0 */
|
||||
/* re J(0,0) */
|
||||
my_dcopy(N, &Jd[0], 4, &x0[0], 8);
|
||||
/* im J(0,0) */
|
||||
my_dcopy(N, &Jd[1], 4, &x0[1], 8);
|
||||
/* re J(1,0) */
|
||||
my_dcopy(N, &Jd[2], 4, &x0[4], 8);
|
||||
/* im J(1,0) */
|
||||
my_dcopy(N, &Jd[3], 4, &x0[5], 8);
|
||||
/* re J(0,1) */
|
||||
my_dcopy(N, &Jd[4*N], 4, &x0[2], 8);
|
||||
/* im J(0,1) */
|
||||
my_dcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
|
||||
/* re J(1,1) */
|
||||
my_dcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
|
||||
/* im J(1,1) */
|
||||
my_dcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
|
||||
}
|
||||
|
||||
for (ci=0; ci<N; ci++) {
|
||||
pthread_mutex_destroy(&gdata.mx_array[ci]);
|
||||
}
|
||||
pthread_attr_destroy(&gdata.attr);
|
||||
free(gdata.th_array);
|
||||
free(gdata.mx_array);
|
||||
free(gdata.iw);
|
||||
free(gdata.wtd);
|
||||
free(x);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -331,7 +331,8 @@ fns_f(complex double *x, double *y, global_data_rtr_t *gdata) {
|
|||
}
|
||||
|
||||
|
||||
/* worker thread function for weight update (nu+8)/(nu+error^2) */
|
||||
/* worker thread function for weight update (nu+p)/(nu+error^2) */
|
||||
/* p=2, not p=8 because using MAX() not sum for error^2 */
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
|
@ -383,7 +384,8 @@ threadfn_fns_fupdate_weights(void *data) {
|
|||
double r11=t->y[8*ci+6]-creal(T2[3]);
|
||||
double i11=t->y[8*ci+7]-cimag(T2[3]);
|
||||
|
||||
t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
|
||||
//t->wtd[ci] = (t->nu0+8.0)/(t->nu0+(r00*r00+i00*i00+r01*r01+i01*i01+r10*r10+i10*i10+r11*r11+i11*i11));
|
||||
t->wtd[ci] = (t->nu0+2.0)/(t->nu0+MAX(r00*r00+i00*i00,MAX(r01*r01+i01*i01,MAX(r10*r10+i10*i10,r11*r11+i11*i11))));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -495,11 +497,14 @@ fns_fupdate_weights(complex double *x, double *y, global_data_rtr_t *gdata) {
|
|||
pthread_join(gdata->th_array[nth1],NULL);
|
||||
}
|
||||
sumlogw/=(double)Nbase1;
|
||||
free(threaddata);
|
||||
free(threaddata);
|
||||
|
||||
/* find new value for nu, p-variate T dist, p=8 */
|
||||
/* find new value for nu, p-variate T dist, p=8 (update p=2 because using MAX() for residual calculation, not sum) */
|
||||
/* psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0, AECM */
|
||||
double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 8, dp->robust_nu);
|
||||
double nu1=update_nu(sumlogw, 30, Nt, gdata->nulow, gdata->nuhigh, 2, dp->robust_nu);
|
||||
/* make sure new value stays within bounds */
|
||||
if (nu1<gdata->nulow) { return gdata->nulow; }
|
||||
if (nu1>gdata->nuhigh) { return gdata->nuhigh; }
|
||||
|
||||
return nu1;
|
||||
}
|
||||
|
@ -1385,6 +1390,145 @@ armijostep(int N,complex double *x,complex double *teta, double *y, global_data_
|
|||
return nocostred;
|
||||
}
|
||||
|
||||
/* Fine tune initial trust region radius, also update initial value for x
|
||||
A. Sartenaer, 1995
|
||||
returns : trust region estimate,
|
||||
also modifies x
|
||||
eta,Heta,s,x_prop: used as storage
|
||||
*/
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static double
|
||||
itrr(int N,complex double *x,complex double *eta, complex double *Heta, double *y, global_data_rtr_t *gdata, complex double *s, complex double *x_prop) {
|
||||
|
||||
double f0,fk,mk,rho,rho1,Delta0;
|
||||
|
||||
/* initialize trust region radii */
|
||||
double delta_0=1.0;
|
||||
double delta_m=0.0;
|
||||
|
||||
double sigma=0.0;
|
||||
double delta=0.0;
|
||||
|
||||
// initial cost
|
||||
f0=fns_f(x,y,gdata);
|
||||
// gradient at x0
|
||||
fns_fgrad(x,eta,y,gdata,1);
|
||||
//normalize
|
||||
double eta_nrm=my_cnrm2(4*N,eta);
|
||||
my_cscal(4*N, 1.0/eta_nrm+0.0*_Complex_I, eta);
|
||||
|
||||
my_ccopy(4*N,eta,1,s,1);
|
||||
my_cscal(4*N, delta_0+0.0*_Complex_I, s);
|
||||
//Hessian at s
|
||||
fns_fhess(x,s,Heta,y,gdata);
|
||||
|
||||
/* constants used */
|
||||
double gamma_1=0.0625; double gamma_2=5.0; double gamma_3=0.5; double gamma_4=2.0;
|
||||
double mu_0=0.5; double mu_1=0.5; double mu_2=0.35;
|
||||
double teta=0.25;
|
||||
|
||||
|
||||
int m,MK=4;
|
||||
for (m=0; m<MK; m++) {
|
||||
/* x_prop=x0-s */
|
||||
my_ccopy(4*N,x,1,x_prop,1);
|
||||
my_caxpy(4*N, s, -1.0+0.0*_Complex_I, x_prop);
|
||||
|
||||
/* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
|
||||
mk=f0-fns_g(N,x_prop,eta,s)-0.5*fns_g(N,x_prop,Heta,s);
|
||||
fk=fns_f(x_prop,y,gdata);
|
||||
|
||||
if (f0==mk) {
|
||||
rho=1e9;
|
||||
} else {
|
||||
rho=(f0-fk)/(f0-mk);
|
||||
}
|
||||
rho1=fabs(rho-1.0);
|
||||
|
||||
/* update max radius */
|
||||
if (rho1<mu_0) {
|
||||
delta_m=MAX(delta_m,delta_0);
|
||||
}
|
||||
if ((f0-fk)>delta) {
|
||||
delta=f0-fk;
|
||||
sigma=delta_0;
|
||||
}
|
||||
/* radius update */
|
||||
double beta_1,beta_2,beta_i;
|
||||
beta_1=0.0;
|
||||
beta_2=0.0;
|
||||
if (m<MK) {
|
||||
double g0_s=fns_g(N,x,eta,s);
|
||||
double b1=(teta*(f0-g0_s)+(1.0-teta)*mk-fk);
|
||||
beta_1=(b1==0.0?1e9:-teta*g0_s/b1);
|
||||
|
||||
double b2=(-teta*(f0-g0_s)+(1.0+teta)*mk-fk);
|
||||
beta_2=(b2==0.0?1e9:teta*g0_s/b2);
|
||||
|
||||
double minbeta=MIN(beta_1,beta_2);
|
||||
double maxbeta=MAX(beta_1,beta_2);
|
||||
if (rho1>mu_1) {
|
||||
if (minbeta>1.0) {
|
||||
beta_i=gamma_3;
|
||||
} else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0)) {
|
||||
beta_i=gamma_1;
|
||||
} else if ((beta_1>=gamma_1 && beta_1<1.0) && (beta_2<gamma_1 || beta_2>=1.0)) {
|
||||
beta_i=beta_1;
|
||||
} else if ((beta_2>=gamma_1 && beta_2<1.0) && (beta_1<gamma_1 || beta_1>=1.0)) {
|
||||
beta_i=beta_2;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
} else if (rho1<=mu_2) {
|
||||
if (maxbeta<1.0) {
|
||||
beta_i=gamma_4;
|
||||
} else if (maxbeta>gamma_2) {
|
||||
beta_i=gamma_2;
|
||||
} else if ((beta_1>=1.0 && beta_1<=gamma_2) && beta_2<1.0) {
|
||||
beta_i=beta_1;
|
||||
} else if ((beta_2>=1.0 && beta_2<=gamma_2) && beta_1<1.0) {
|
||||
beta_i=beta_2;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
} else {
|
||||
if (maxbeta<gamma_3) {
|
||||
beta_i=gamma_3;
|
||||
} else if (maxbeta>gamma_4) {
|
||||
beta_i=gamma_4;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
}
|
||||
/* update radius */
|
||||
delta_0=delta_0/beta_i;
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
|
||||
#endif
|
||||
|
||||
my_ccopy(4*N,eta,1,s,1);
|
||||
my_cscal(4*N,delta_0+0.0*_Complex_I, s);
|
||||
}
|
||||
|
||||
|
||||
// update initial value
|
||||
if (delta>0.0) {
|
||||
my_caxpy(4*N, eta, -sigma+0.0*_Complex_I, x);
|
||||
}
|
||||
|
||||
if (delta_m>0.0) {
|
||||
Delta0=delta_m;
|
||||
} else {
|
||||
Delta0=delta_0;
|
||||
}
|
||||
|
||||
return Delta0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int
|
||||
|
@ -1583,7 +1727,8 @@ rtr_solve_nocuda_robust_admm(
|
|||
int rsdstat=0;
|
||||
/***************************************************/
|
||||
/* RSD solution */
|
||||
for (ci=0; ci<itmax_rsd; ci++) {
|
||||
//for (ci=0; ci<itmax_rsd; ci++) {
|
||||
for (ci=0; ci<0; ci++) {
|
||||
/* Armijo step */
|
||||
/* teta=armijostep(V,C,N,x); */
|
||||
rsdstat=armijostep(N,x,eta,y,&gdata,fgradx,x_prop,&fx); /* NOTE last two are just storage */
|
||||
|
@ -1596,7 +1741,16 @@ rtr_solve_nocuda_robust_admm(
|
|||
}
|
||||
}
|
||||
|
||||
Delta_bar=MIN(fx,Delta_bar);
|
||||
double Delta_new=itrr(N,x,eta,Heta, y, &gdata, fgradx, x_prop);
|
||||
#ifdef DEBUG
|
||||
printf("TR radius given=%lf est=%lf\n",Delta0,Delta_new);
|
||||
#endif
|
||||
|
||||
//old values
|
||||
//Delta_bar=MIN(fx,Delta_bar);
|
||||
Delta0=MIN(Delta_new,0.01); /* need to be more restrictive for EM */
|
||||
Delta_bar=Delta0*8.0;
|
||||
|
||||
rho_regularization=fx*1e-6;
|
||||
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
|
||||
|
||||
|
|
|
@ -116,7 +116,7 @@ cudakernel_fns_fgrad_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int
|
|||
to tangent space before it is averaged
|
||||
so calculate grad using N(N-1)/2 constraints each (total M)
|
||||
*/
|
||||
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
|
||||
/* need 8N*M/ThreadsPerBlock+ 8N float storage */
|
||||
static void
|
||||
cudakernel_fns_fgrad_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, float *y, float *coh, char *bbh, float *iw, float *wtd, int negate, cublasHandle_t cbhandle,float *gWORK) {
|
||||
/* baselines per timeslot = N(N-1)/2 ~2400, timeslots = M/baselines ~120
|
||||
|
@ -213,11 +213,10 @@ printf("N=%d Baselines=%d timeslots=%d total=%d,Threads=%d Blocks=%d\n",N,nbase,
|
|||
}
|
||||
cudaMemcpy(eta,tempeta,4*N*sizeof(cuFloatComplex),cudaMemcpyDeviceToDevice);
|
||||
cudaFree(Bd);
|
||||
//cudakernel_fns_proj(N, x, tempeta, eta, cbhandle);
|
||||
}
|
||||
|
||||
/* Hessian, also projected to tangent space */
|
||||
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
|
||||
/* need 8N*M/ThreadsPerBlock+ 8N*2 float storage */
|
||||
static void
|
||||
cudakernel_fns_fhess_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
|
||||
cuFloatComplex *tempeta,*tempb;
|
||||
|
@ -265,7 +264,7 @@ cudakernel_fns_fhess_robust1(int ThreadsPerBlock, int BlocksPerGrid, int N, int
|
|||
to tangent space before it is averaged
|
||||
so calculate grad using N(N-1)/2 constraints each (total M)
|
||||
*/
|
||||
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
|
||||
/* need 8N*M/ThreadsPerBlock+ 8N float storage */
|
||||
static void
|
||||
cudakernel_fns_fhess_robust(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
|
||||
cuFloatComplex *tempeta;
|
||||
|
@ -350,7 +349,6 @@ printf("N=%d Baselines=%d timeslots=%d total=%d,Threads=%d Blocks=%d\n",N,nbase,
|
|||
cudaMemcpy(fhess,tempeta,4*N*sizeof(cuFloatComplex),cudaMemcpyDeviceToDevice);
|
||||
cudaFree(Bd);
|
||||
|
||||
//cudakernel_fns_proj(N, x, tempeta, fhess, cbhandle);
|
||||
}
|
||||
|
||||
|
||||
|
@ -449,6 +447,160 @@ printf("m=%d lhs=%e rhs=%e rat=%e norm=%e\n",m,lhs,rhs,lhs/rhs,metric);
|
|||
}
|
||||
|
||||
|
||||
/* Fine tune initial trust region radius, also update initial value for x
|
||||
A. Sartenaer, 1995
|
||||
returns : trust region estimate,
|
||||
also modifies x
|
||||
eta,Heta: used as storage
|
||||
*/
|
||||
/* need 8N*2 + MAX(2 Blocks + 4, 8N (1 + ceil(M/Threads))) float storage */
|
||||
static float
|
||||
itrr(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *Heta, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
|
||||
cuFloatComplex alpha;
|
||||
cublasStatus_t cbstatus;
|
||||
/* temp storage, re-using global storage */
|
||||
cuFloatComplex *s, *x_prop;
|
||||
unsigned long int moff=0;
|
||||
s=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
x_prop=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
float *gWORK1=&gWORK[moff];
|
||||
|
||||
|
||||
float f0,fk,mk,rho,rho1,Delta0;
|
||||
/* initialize trust region radii */
|
||||
float delta_0=1.0f;
|
||||
float delta_m=0.0f;
|
||||
|
||||
float sigma=0.0f;
|
||||
float delta=0.0f;
|
||||
|
||||
// initial cost
|
||||
f0=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,y,coh,bbh,wtd,gWORK1);
|
||||
// gradient at x0;
|
||||
cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,eta,y,coh,bbh,iw,wtd,1,cbhandle, gWORK1);
|
||||
// normalize
|
||||
float eta_nrm;
|
||||
cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
|
||||
alpha.x=1.0f/eta_nrm;alpha.y=0.0f;
|
||||
cbstatus=cublasCscal(cbhandle,4*N,&alpha,eta,1);
|
||||
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
|
||||
alpha.x=delta_0;alpha.y=0.0f;
|
||||
cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
|
||||
/* Hessian at s */
|
||||
cudakernel_fns_fhess_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x,s,Heta,y,coh,bbh,iw,wtd,cbhandle, gWORK1);
|
||||
|
||||
/* constants used */
|
||||
float gamma_1=0.0625f; float gamma_2=5.0f; float gamma_3=0.5f; float gamma_4=2.0f;
|
||||
float mu_0=0.5f; float mu_1=0.5f; float mu_2=0.35f;
|
||||
float teta=0.25f;
|
||||
|
||||
|
||||
int MK=4;
|
||||
int m;
|
||||
for (m=0; m<MK; m++) {
|
||||
/* x_prop=x0-s */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,x,1,x_prop,1);
|
||||
alpha.x=-1.0f;alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, s, 1, x_prop, 1);
|
||||
|
||||
/* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
|
||||
mk=f0-cudakernel_fns_g(N,x_prop,eta,s,cbhandle)-0.5f*cudakernel_fns_g(N,x_prop,Heta,s,cbhandle);
|
||||
fk=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,y,coh,bbh,wtd,gWORK1);
|
||||
|
||||
if (f0==mk) {
|
||||
rho=1e9f;
|
||||
} else {
|
||||
rho=(f0-fk)/(f0-mk);
|
||||
}
|
||||
rho1=fabsf(rho-1.0f);
|
||||
|
||||
/* update max radius */
|
||||
if (rho1<mu_0) {
|
||||
delta_m=MAX(delta_m,delta_0);
|
||||
}
|
||||
if ((f0-fk)>delta) {
|
||||
delta=f0-fk;
|
||||
sigma=delta_0;
|
||||
}
|
||||
/* radius update */
|
||||
float beta_1,beta_2,beta_i;
|
||||
beta_1=0.0f;
|
||||
beta_2=0.0f;
|
||||
|
||||
if (m<MK) {
|
||||
float g0_s=cudakernel_fns_g(N,x,eta,s,cbhandle);
|
||||
float b1=(teta*(f0-g0_s)+(1.0f-teta)*mk-fk);
|
||||
beta_1=(b1==0.0f?1e9f:-teta*g0_s/b1);
|
||||
|
||||
float b2=(-teta*(f0-g0_s)+(1.0f+teta)*mk-fk);
|
||||
beta_2=(b2==0.0f?1e9f:teta*g0_s/b2);
|
||||
|
||||
float minbeta=MIN(beta_1,beta_2);
|
||||
float maxbeta=MAX(beta_1,beta_2);
|
||||
if (rho1>mu_1) {
|
||||
if (minbeta>1.0f) {
|
||||
beta_i=gamma_3;
|
||||
} else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0f)) {
|
||||
beta_i=gamma_1;
|
||||
} else if ((beta_1>=gamma_1 && beta_1<1.0f) && (beta_2<gamma_1 || beta_2>=1.0f)) {
|
||||
beta_i=beta_1;
|
||||
} else if ((beta_2>=gamma_1 && beta_2<1.0f) && (beta_1<gamma_1 || beta_1>=1.0f)) {
|
||||
beta_i=beta_2;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
} else if (rho1<=mu_2) {
|
||||
if (maxbeta<1.0f) {
|
||||
beta_i=gamma_4;
|
||||
} else if (maxbeta>gamma_2) {
|
||||
beta_i=gamma_2;
|
||||
} else if ((beta_1>=1.0f && beta_1<=gamma_2) && beta_2<1.0f) {
|
||||
beta_i=beta_1;
|
||||
} else if ((beta_2>=1.0f && beta_2<=gamma_2) && beta_1<1.0f) {
|
||||
beta_i=beta_2;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
} else {
|
||||
if (maxbeta<gamma_3) {
|
||||
beta_i=gamma_3;
|
||||
} else if (maxbeta>gamma_4) {
|
||||
beta_i=gamma_4;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
}
|
||||
/* update radius */
|
||||
delta_0=delta_0/beta_i;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
|
||||
#endif
|
||||
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
|
||||
alpha.x=delta_0;alpha.y=0.0f;
|
||||
cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
|
||||
}
|
||||
|
||||
// update initial value
|
||||
if (delta>0.0f) {
|
||||
alpha.x=-sigma; alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, eta, 1, x, 1);
|
||||
}
|
||||
|
||||
if (delta_m>0.0f) {
|
||||
Delta0=delta_m;
|
||||
} else {
|
||||
Delta0=delta_0;
|
||||
}
|
||||
|
||||
return Delta0;
|
||||
}
|
||||
|
||||
|
||||
/* truncated conjugate gradient method
|
||||
x, grad, eta, r, z, delta, Hxd : size 2N x 2 complex
|
||||
so, vector size is 4N complex double
|
||||
|
@ -572,6 +724,12 @@ tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComp
|
|||
}
|
||||
|
||||
|
||||
|
||||
/* storage:
|
||||
8N * 5 + N + 8M * 2 + 2M + M (base storage)
|
||||
MAX( 2 * Blocks + 4, 8N(6 + ceil(M/Threads))) for functions
|
||||
Blocks = ceil(M/Threads)
|
||||
*/
|
||||
int
|
||||
rtr_solve_cuda_robust_fl(
|
||||
float *x0, /* initial values and updated solution at output (size 8*N float) */
|
||||
|
@ -745,28 +903,36 @@ rtr_solve_cuda_robust_fl(
|
|||
printf("Initial Cost=%g\n",fx0);
|
||||
#endif
|
||||
/***************************************************/
|
||||
int rsdstat=0;
|
||||
/* RSD solution */
|
||||
for (ci=0; ci<itmax_sd; ci++) {
|
||||
/* Armijo step */
|
||||
/* teta=armijostep(V,C,N,x); */
|
||||
rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
|
||||
/* x=R(x,teta); */
|
||||
cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
|
||||
//my_ccopy(4*N,x_propd,1,xd,1);
|
||||
if (!rsdstat) {
|
||||
/* cost reduced, update solution */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
|
||||
} else {
|
||||
/* no cost reduction, break loop */
|
||||
break;
|
||||
}
|
||||
}
|
||||
// int rsdstat=0;
|
||||
// /* RSD solution - disabled */
|
||||
// for (ci=0; ci<itmax_sd; ci++) {
|
||||
// /* Armijo step */
|
||||
// /* teta=armijostep(V,C,N,x); */
|
||||
// rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
|
||||
// /* x=R(x,teta); */
|
||||
// cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
|
||||
// if (!rsdstat) {
|
||||
// /* cost reduced, update solution */
|
||||
// cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
|
||||
// } else {
|
||||
// /* no cost reduction, break loop */
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
|
||||
float Delta_new=itrr(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, Hetad, yd, cohd, bbd, iwd, wtd, cbhandle, gWORK1);
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("TR radius given=%f est=%f\n",Delta0,Delta_new);
|
||||
#endif
|
||||
|
||||
//old values
|
||||
//Delta_bar=MIN(fx,0.01f);
|
||||
//Delta0=Delta_bar*0.125f;
|
||||
Delta0=MIN(Delta_new,0.01f); /* need to be more restrictive for EM */
|
||||
Delta_bar=Delta0*8.0f;
|
||||
|
||||
cudakernel_fns_fupdate_weights(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,robust_nu);
|
||||
|
||||
Delta_bar=MIN(fx,0.01f);
|
||||
Delta0=Delta_bar*0.125f;
|
||||
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
|
||||
|
||||
#ifdef DEBUG
|
||||
|
@ -918,7 +1084,8 @@ printf("NEW RTR cost=%g\n",fx);
|
|||
|
||||
/***************************************************/
|
||||
cudaDeviceSynchronize();
|
||||
/* w <= (8+nu)/(1+error^2), q<=w-log(w) */
|
||||
/* w <= (p+nu)/(1+error^2), q<=w-log(w) */
|
||||
/* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
|
||||
cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
|
||||
/* sumq<=sum(w-log(w))/N */
|
||||
cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
|
||||
|
@ -938,7 +1105,349 @@ printf("NEW RTR cost=%g\n",fx);
|
|||
#ifdef DEBUG
|
||||
printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
|
||||
#endif
|
||||
dp->robust_nu=(double)robust_nu1;
|
||||
/* seems pedantic, but make sure new value for robust_nu fits within bounds */
|
||||
if (robust_nu1<robust_nulow) {
|
||||
dp->robust_nu=robust_nulow;
|
||||
} else if (robust_nu1>robust_nuhigh) {
|
||||
dp->robust_nu=robust_nuhigh;
|
||||
} else {
|
||||
dp->robust_nu=(double)robust_nu1;
|
||||
}
|
||||
|
||||
if(fx0>fx) {
|
||||
//printf("Cost final %g initial %g\n",fx,fx0);
|
||||
/* copy back current solution */
|
||||
err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
|
||||
/* copy back solution to x0 : format checked*/
|
||||
/* re J(0,0) */
|
||||
my_fcopy(N, &Jd[0], 4, &x0[0], 8);
|
||||
/* im J(0,0) */
|
||||
my_fcopy(N, &Jd[1], 4, &x0[1], 8);
|
||||
/* re J(1,0) */
|
||||
my_fcopy(N, &Jd[2], 4, &x0[4], 8);
|
||||
/* im J(1,0) */
|
||||
my_fcopy(N, &Jd[3], 4, &x0[5], 8);
|
||||
/* re J(0,1) */
|
||||
my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
|
||||
/* im J(0,1) */
|
||||
my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
|
||||
/* re J(1,1) */
|
||||
my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
|
||||
/* im J(1,1) */
|
||||
my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
|
||||
|
||||
}
|
||||
free(x);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* storage:
|
||||
8N * 6 + N + 8M * 2 + 2M + M (base storage)
|
||||
MAX( 2 * Blocks + 4, 8N(1 + ceil(M/Threads))) for functions
|
||||
Blocks = ceil(M/Threads)
|
||||
*/
|
||||
int
|
||||
nsd_solve_cuda_robust_fl(
|
||||
float *x0, /* initial values and updated solution at output (size 8*N float) */
|
||||
float *y, /* data vector (size 8*M float) */
|
||||
int N, /* no of stations */
|
||||
int M, /* no of constraints */
|
||||
int itmax, /* maximum number of iterations */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
double *info, /* initial and final residuals */
|
||||
cublasHandle_t cbhandle, /* device handle */
|
||||
float *gWORK, /* GPU allocated memory */
|
||||
int tileoff, /* tile offset when solving for many chunks */
|
||||
int ntiles, /* total tile (data) size being solved for */
|
||||
me_data_t *adata)
|
||||
{
|
||||
|
||||
/* general note: all device variables end with a 'd' */
|
||||
cudaError_t err;
|
||||
cublasStatus_t cbstatus;
|
||||
|
||||
/* ME data */
|
||||
me_data_t *dp=(me_data_t*)adata;
|
||||
int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
|
||||
/* coherency on device */
|
||||
float *cohd;
|
||||
/* baseline-station map on device/host */
|
||||
char *bbd;
|
||||
|
||||
/* calculate no of cuda threads and blocks */
|
||||
int ThreadsPerBlock=128;
|
||||
int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
|
||||
|
||||
|
||||
/* reshape x to make J: 2Nx2 complex double
|
||||
*/
|
||||
complex float *x;
|
||||
if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
/* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
|
||||
to
|
||||
J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
|
||||
*/
|
||||
float *Jd=(float*)x;
|
||||
/* re J(0,0) */
|
||||
my_fcopy(N, &x0[0], 8, &Jd[0], 4);
|
||||
/* im J(0,0) */
|
||||
my_fcopy(N, &x0[1], 8, &Jd[1], 4);
|
||||
/* re J(1,0) */
|
||||
my_fcopy(N, &x0[4], 8, &Jd[2], 4);
|
||||
/* im J(1,0) */
|
||||
my_fcopy(N, &x0[5], 8, &Jd[3], 4);
|
||||
/* re J(0,1) */
|
||||
my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
|
||||
/* im J(0,1) */
|
||||
my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
|
||||
/* re J(1,1) */
|
||||
my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
|
||||
/* im J(1,1) */
|
||||
my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
|
||||
|
||||
|
||||
int ci;
|
||||
|
||||
/***************************************************/
|
||||
cuFloatComplex *xd,*fgradxd,*etad,*zd,*x_propd,*z_propd;
|
||||
float *yd;
|
||||
float *wtd,*qd; /* for robust weight and log(weight) */
|
||||
float robust_nu=(float)dp->robust_nu;
|
||||
float q_sum,robust_nu1;
|
||||
float deltanu;
|
||||
int Nd=100; /* no of points where nu is sampled, note Nd<N */
|
||||
if (Nd>M) { Nd=M; }
|
||||
deltanu=(float)(robust_nuhigh-robust_nulow)/(float)Nd;
|
||||
|
||||
/* for counting how many baselines contribute to each station
|
||||
grad/hess calculation */
|
||||
float *iwd,*iw;
|
||||
if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
unsigned long int moff=0;
|
||||
fgradxd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N; /* 4N complex means 8N float */
|
||||
etad=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
zd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
x_propd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
xd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
z_propd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
|
||||
yd=&gWORK[moff];
|
||||
moff+=8*M;
|
||||
cohd=&gWORK[moff];
|
||||
moff+=Nbase*8;
|
||||
bbd=(char*)&gWORK[moff];
|
||||
unsigned long int charstor=(Nbase*2*sizeof(char))/sizeof(float);
|
||||
if (!charstor || charstor%4) {
|
||||
moff+=(charstor/4+1)*4; /* NOTE +4 multiple to align memory */
|
||||
} else {
|
||||
moff+=charstor;
|
||||
}
|
||||
iwd=&gWORK[moff];
|
||||
if (!(N%4)) {
|
||||
moff+=N;
|
||||
} else {
|
||||
moff+=(N/4+1)*4;
|
||||
}
|
||||
wtd=&gWORK[moff];
|
||||
if (!(M%4)) {
|
||||
moff+=M;
|
||||
} else {
|
||||
moff+=(M/4+1)*4;
|
||||
}
|
||||
qd=&gWORK[moff];
|
||||
if (!(M%4)) {
|
||||
moff+=M;
|
||||
} else {
|
||||
moff+=(M/4+1)*4;
|
||||
}
|
||||
|
||||
|
||||
/* remaining memory */
|
||||
float *gWORK1=&gWORK[moff];
|
||||
|
||||
/* yd <=y : V */
|
||||
err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* need to give right offset for coherencies */
|
||||
/* offset: cluster offset+time offset */
|
||||
/* C */
|
||||
err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* correct offset for baselines */
|
||||
err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(char), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* xd <=x : solution */
|
||||
err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
float fx,fx0;
|
||||
|
||||
/* count how many baselines contribute to each station, store (inverse) in iwd */
|
||||
count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
|
||||
err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
free(iw);
|
||||
|
||||
/* set initial weights to 1 by a cuda kernel */
|
||||
cudakernel_setweights_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, wtd, 1.0f);
|
||||
fx=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,gWORK1);
|
||||
fx0=fx;
|
||||
#ifdef DEBUG
|
||||
printf("Initial Cost=%g\n",fx0);
|
||||
#endif
|
||||
/***************************************************/
|
||||
// gradient at x0;
|
||||
cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
|
||||
// Hessian
|
||||
cudakernel_fns_fhess_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,xd,zd,yd,cohd,bbd,iwd,wtd,cbhandle,gWORK1);
|
||||
// initial step = 1/||Hess||
|
||||
float hess_nrm;
|
||||
cublasScnrm2(cbhandle,4*N,zd,1,&hess_nrm);
|
||||
float t=1.0f/hess_nrm;
|
||||
/* if initial step too small */
|
||||
if (t<1e-6f) {
|
||||
t=1e-6f;
|
||||
}
|
||||
|
||||
/* z <= x */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
|
||||
float theta=1.0f;
|
||||
float ALPHA = 1.01f; // step-size growth factor
|
||||
float BETA = 0.5f; // step-size shrinkage factor
|
||||
int k;
|
||||
cuFloatComplex alpha;
|
||||
|
||||
for (k=0; k<itmax; k++) {
|
||||
/* x_prop <= x */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,x_propd,1);
|
||||
/* z_prop <= z */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,zd,1,z_propd,1);
|
||||
|
||||
/* x <= z - t * grad */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,zd,1,xd,1);
|
||||
alpha.x=-t;alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, xd, 1);
|
||||
|
||||
/* if ||x-z|| == t||grad|| is below threshold, stop iteration */
|
||||
float grad_nrm,x_nrm;
|
||||
cublasScnrm2(cbhandle,4*N,fgradxd,1,&grad_nrm);
|
||||
cublasScnrm2(cbhandle,4*N,xd,1,&x_nrm);
|
||||
/* norm(y-x)/max(1,norm(x)); */
|
||||
if (grad_nrm*t/MAX(1.0f,x_nrm) < 1e-6f) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
/* theta = 2/(1 + sqrt(1+4/(theta^2))); */
|
||||
theta=2.0f/(1.0f + sqrtf(1.0f+4.0f/(theta*theta)));
|
||||
|
||||
/* z = x + (1-theta)*(x-x_prop);
|
||||
z = (2-theta)*x - (1-theta) * x_prop */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
|
||||
alpha.x=(2.0f-theta);alpha.y=0.0f;
|
||||
cbstatus=cublasCscal(cbhandle,4*N,&alpha,zd,1);
|
||||
alpha.x=-(1.0f-theta);alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, x_propd, 1, zd, 1);
|
||||
|
||||
/* eta = grad_old;
|
||||
grad <= grad_f( z ) */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,fgradxd,1,etad,1);
|
||||
cudakernel_fns_fgrad_robust(ThreadsPerBlock,BlocksPerGrid,N,M,zd,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
|
||||
|
||||
/* z_prop <= z_prop - z */
|
||||
alpha.x=-1.0f;alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, zd, 1, z_propd, 1);
|
||||
/* eta <= eta - new_grad */
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, etad, 1);
|
||||
|
||||
/* ||z-z_prop|| */
|
||||
float ydiffnrm;
|
||||
cublasScnrm2(cbhandle,4*N,z_propd,1,&ydiffnrm);
|
||||
/* (z_zold)'*(grad-grad_old) */
|
||||
float dot_ydiff_gdiff;
|
||||
cbstatus=cublasSdot(cbhandle, 8*N, (float*)z_propd, 1, (float*)etad, 1, &dot_ydiff_gdiff);
|
||||
#ifdef DEBUG
|
||||
printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
|
||||
#endif
|
||||
/* the above can be NAN, if so break loop */
|
||||
if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
/* backtracking
|
||||
t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
|
||||
t = min( ALPHA*t, max( BETA*t, t_hat ));
|
||||
*/
|
||||
float t_hat=0.5f*(ydiffnrm*ydiffnrm)/fabsf(dot_ydiff_gdiff);
|
||||
t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
|
||||
#ifdef DEBUG
|
||||
printf("k=%d theta=%e step=%e\n",k,theta,t);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* final residual */
|
||||
fx=cudakernel_fns_f_robust(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,gWORK1);
|
||||
info[1]=fx;
|
||||
#ifdef DEBUG
|
||||
printf("NEW NSD cost=%g\n",fx);
|
||||
#endif
|
||||
|
||||
/***************************************************/
|
||||
cudaDeviceSynchronize();
|
||||
/* w <= (p+nu)/(1+error^2), q<=w-log(w) */
|
||||
/* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
|
||||
cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
|
||||
/* sumq<=sum(w-log(w))/N */
|
||||
cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
|
||||
q_sum/=(float)M;
|
||||
#ifdef DEBUG
|
||||
printf("deltanu=%f sum(w-log(w))=%f\n",deltanu,q_sum);
|
||||
#endif
|
||||
/* for nu range 2~numax evaluate, p-variate T
|
||||
psi((nu0+p)/2)-ln((nu0+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1
|
||||
note: AECM not ECME
|
||||
and find min(| |) */
|
||||
int ThreadsPerBlock2=ThreadsPerBlock/4;
|
||||
cudakernel_evaluatenu_fl_eight(ThreadsPerBlock2, (Nd+ThreadsPerBlock-1)/ThreadsPerBlock2, Nd, q_sum, qd, deltanu,(float)robust_nulow,robust_nu);
|
||||
/* find min(abs()) value */
|
||||
cbstatus=cublasIsamin(cbhandle, Nd, qd, 1, &ci); /* 1 based index */
|
||||
robust_nu1=(float)robust_nulow+(float)(ci-1)*deltanu;
|
||||
#ifdef DEBUG
|
||||
printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
|
||||
#endif
|
||||
/* seems pedantic, but make sure new value for robust_nu fits within bounds */
|
||||
if (robust_nu1<robust_nulow) {
|
||||
dp->robust_nu=robust_nulow;
|
||||
} else if (robust_nu1>robust_nuhigh) {
|
||||
dp->robust_nu=robust_nuhigh;
|
||||
} else {
|
||||
dp->robust_nu=(double)robust_nu1;
|
||||
}
|
||||
|
||||
if(fx0>fx) {
|
||||
//printf("Cost final %g initial %g\n",fx,fx0);
|
||||
|
|
|
@ -64,6 +64,7 @@ checkCublasError(cublasStatus_t cbstatus, char *file, int line)
|
|||
|
||||
|
||||
/* cost function */
|
||||
/* storage <= (2 Blocks+4) + 8N */
|
||||
static float
|
||||
cudakernel_fns_f_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, float *y, float *coh, char *bbh, float *wtd, cublasHandle_t cbhandle, float *gWORK){
|
||||
cuFloatComplex *Yd;
|
||||
|
@ -113,7 +114,7 @@ cudakernel_fns_proj_admm(int N, cuFloatComplex *x, cuFloatComplex *z, cuFloatCom
|
|||
|
||||
|
||||
/* gradient, also projected to tangent space */
|
||||
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
|
||||
/* need 8N*M/ThreadsPerBlock+ 8N float storage */
|
||||
static void
|
||||
cudakernel_fns_fgrad_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta, float *y, float *coh, char *bbh, float *iw, float *wtd, int negate, cublasHandle_t cbhandle,float *gWORK) {
|
||||
|
||||
|
@ -182,7 +183,7 @@ cudakernel_fns_fgrad_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N,
|
|||
}
|
||||
|
||||
/* Hessian, also projected to tangent space */
|
||||
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
|
||||
/* need 8N*M/ThreadsPerBlock+ 8N float storage */
|
||||
static void
|
||||
cudakernel_fns_fhess_robust_admm(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
|
||||
cuFloatComplex *tempeta;
|
||||
|
@ -276,6 +277,161 @@ armijostep(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex
|
|||
}
|
||||
|
||||
|
||||
/* Fine tune initial trust region radius, also update initial value for x
|
||||
A. Sartenaer, 1995
|
||||
returns : trust region estimate,
|
||||
also modifies x
|
||||
eta,Heta: used as storage
|
||||
*/
|
||||
/* need 8N*2 + MAX(8N+2 Blocks + 4, 8N (1 + ceil(M/Threads))) float storage */
|
||||
static float
|
||||
itrr(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *Y, cuFloatComplex *Z, float admm_rho, cuFloatComplex *eta, cuFloatComplex *Heta, float *y, float *coh, char *bbh, float *iw, float *wtd, cublasHandle_t cbhandle, float *gWORK) {
|
||||
cuFloatComplex alpha;
|
||||
cublasStatus_t cbstatus;
|
||||
/* temp storage, re-using global storage */
|
||||
cuFloatComplex *s, *x_prop;
|
||||
unsigned long int moff=0;
|
||||
s=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
x_prop=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
float *gWORK1=&gWORK[moff];
|
||||
|
||||
|
||||
float f0,fk,mk,rho,rho1,Delta0;
|
||||
/* initialize trust region radii */
|
||||
float delta_0=1.0f;
|
||||
float delta_m=0.0f;
|
||||
|
||||
float sigma=0.0f;
|
||||
float delta=0.0f;
|
||||
|
||||
// initial cost
|
||||
f0=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,y,coh,bbh,wtd,cbhandle,gWORK1);
|
||||
// gradient at x0;
|
||||
cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,eta,y,coh,bbh,iw,wtd,1,cbhandle, gWORK1);
|
||||
// normalize
|
||||
float eta_nrm;
|
||||
cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
|
||||
alpha.x=1.0f/eta_nrm;alpha.y=0.0f;
|
||||
cbstatus=cublasCscal(cbhandle,4*N,&alpha,eta,1);
|
||||
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
|
||||
alpha.x=delta_0;alpha.y=0.0f;
|
||||
cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
|
||||
/* Hessian at s */
|
||||
cudakernel_fns_fhess_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x,Y,Z,admm_rho,s,Heta,y,coh,bbh,iw,wtd,cbhandle,gWORK1);
|
||||
|
||||
/* constants used */
|
||||
float gamma_1=0.0625f; float gamma_2=5.0f; float gamma_3=0.5f; float gamma_4=2.0f;
|
||||
float mu_0=0.5f; float mu_1=0.5f; float mu_2=0.35f;
|
||||
float teta=0.25f;
|
||||
|
||||
|
||||
int MK=4;
|
||||
int m;
|
||||
for (m=0; m<MK; m++) {
|
||||
/* x_prop=x0-s */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,x,1,x_prop,1);
|
||||
alpha.x=-1.0f;alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, s, 1, x_prop, 1);
|
||||
|
||||
/* model = f0 - g(x_prop,g0,s) - 0.5 g(x_prop,Hess,s) */
|
||||
mk=f0-cudakernel_fns_g(N,x_prop,eta,s,cbhandle)-0.5f*cudakernel_fns_g(N,x_prop,Heta,s,cbhandle);
|
||||
fk=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,Y,Z,admm_rho,y,coh,bbh,wtd,cbhandle,gWORK1);
|
||||
|
||||
if (f0==mk) {
|
||||
rho=1e9f;
|
||||
} else {
|
||||
rho=(f0-fk)/(f0-mk);
|
||||
}
|
||||
rho1=fabsf(rho-1.0f);
|
||||
|
||||
/* update max radius */
|
||||
if (rho1<mu_0) {
|
||||
delta_m=MAX(delta_m,delta_0);
|
||||
}
|
||||
if ((f0-fk)>delta) {
|
||||
delta=f0-fk;
|
||||
sigma=delta_0;
|
||||
}
|
||||
/* radius update */
|
||||
float beta_1,beta_2,beta_i;
|
||||
beta_1=0.0f;
|
||||
beta_2=0.0f;
|
||||
|
||||
if (m<MK) {
|
||||
float g0_s=cudakernel_fns_g(N,x,eta,s,cbhandle);
|
||||
float b1=(teta*(f0-g0_s)+(1.0f-teta)*mk-fk);
|
||||
beta_1=(b1==0.0f?1e9f:-teta*g0_s/b1);
|
||||
|
||||
float b2=(-teta*(f0-g0_s)+(1.0f+teta)*mk-fk);
|
||||
beta_2=(b2==0.0f?1e9f:teta*g0_s/b2);
|
||||
|
||||
float minbeta=MIN(beta_1,beta_2);
|
||||
float maxbeta=MAX(beta_1,beta_2);
|
||||
if (rho1>mu_1) {
|
||||
if (minbeta>1.0f) {
|
||||
beta_i=gamma_3;
|
||||
} else if ((maxbeta<gamma_1) || (minbeta<gamma_1 && maxbeta>=1.0f)) {
|
||||
beta_i=gamma_1;
|
||||
} else if ((beta_1>=gamma_1 && beta_1<1.0f) && (beta_2<gamma_1 || beta_2>=1.0f)) {
|
||||
beta_i=beta_1;
|
||||
} else if ((beta_2>=gamma_1 && beta_2<1.0f) && (beta_1<gamma_1 || beta_1>=1.0f)) {
|
||||
beta_i=beta_2;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
} else if (rho1<=mu_2) {
|
||||
if (maxbeta<1.0f) {
|
||||
beta_i=gamma_4;
|
||||
} else if (maxbeta>gamma_2) {
|
||||
beta_i=gamma_2;
|
||||
} else if ((beta_1>=1.0f && beta_1<=gamma_2) && beta_2<1.0f) {
|
||||
beta_i=beta_1;
|
||||
} else if ((beta_2>=1.0f && beta_2<=gamma_2) && beta_1<1.0f) {
|
||||
beta_i=beta_2;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
} else {
|
||||
if (maxbeta<gamma_3) {
|
||||
beta_i=gamma_3;
|
||||
} else if (maxbeta>gamma_4) {
|
||||
beta_i=gamma_4;
|
||||
} else {
|
||||
beta_i=maxbeta;
|
||||
}
|
||||
}
|
||||
/* update radius */
|
||||
delta_0=delta_0/beta_i;
|
||||
}
|
||||
#ifdef DEBUG
|
||||
printf("m=%d delta_0=%e delta_max=%e beta=%e rho=%e\n",m,delta_0,delta_m,beta_i,rho);
|
||||
#endif
|
||||
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,eta,1,s,1);
|
||||
alpha.x=delta_0;alpha.y=0.0f;
|
||||
cbstatus=cublasCscal(cbhandle,4*N,&alpha,s,1);
|
||||
}
|
||||
|
||||
// update initial value
|
||||
if (delta>0.0f) {
|
||||
alpha.x=-sigma; alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, eta, 1, x, 1);
|
||||
}
|
||||
|
||||
if (delta_m>0.0f) {
|
||||
Delta0=delta_m;
|
||||
} else {
|
||||
Delta0=delta_0;
|
||||
}
|
||||
|
||||
return Delta0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* truncated conjugate gradient method
|
||||
x, grad, eta, r, z, delta, Hxd : size 2N x 2 complex
|
||||
so, vector size is 4N complex double
|
||||
|
@ -399,6 +555,11 @@ tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComp
|
|||
}
|
||||
|
||||
|
||||
/* storage:
|
||||
8N * 5 + N + 8M * 2 + 2M + M (base storage)
|
||||
MAX( 8N+ 2 * Blocks + 4, 8N(6 + ceil(M/Threads))) for functions
|
||||
Blocks = ceil(M/Threads)
|
||||
*/
|
||||
int
|
||||
rtr_solve_cuda_robust_admm_fl(
|
||||
float *x0, /* initial values and updated solution at output (size 8*N float) */
|
||||
|
@ -407,8 +568,7 @@ rtr_solve_cuda_robust_admm_fl(
|
|||
float *y, /* data vector (size 8*M float) */
|
||||
int N, /* no of stations */
|
||||
int M, /* no of constraints */
|
||||
int itmax_sd, /* maximum number of iterations RSD */
|
||||
int itmax_rtr, /* maximum number of iterations RTR */
|
||||
int itmax_rtr, /* maximum number of iterations */
|
||||
float Delta_bar, float Delta0, /* Trust region radius and initial value */
|
||||
float admm_rho, /* ADMM regularization */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
|
@ -572,12 +732,6 @@ rtr_solve_cuda_robust_admm_fl(
|
|||
cudaMalloc((void **)&Yd, 4*N*sizeof(cuFloatComplex));
|
||||
cudaMalloc((void **)&Zd, 4*N*sizeof(cuFloatComplex));
|
||||
|
||||
|
||||
/* need 8N*(BlocksPerGrid+8) for tcg_solve+grad/hess storage,
|
||||
so total storage needed is
|
||||
8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N + M + M
|
||||
plus 8N + 8N for ADMM params (Y and BZ)
|
||||
*/
|
||||
/* remaining memory */
|
||||
float *gWORK1=&gWORK[moff];
|
||||
|
||||
|
@ -616,28 +770,39 @@ rtr_solve_cuda_robust_admm_fl(
|
|||
printf("Initial Cost=%g\n",fx0);
|
||||
#endif
|
||||
/***************************************************/
|
||||
int rsdstat=0;
|
||||
/* RSD solution */
|
||||
for (ci=0; ci<itmax_sd; ci++) {
|
||||
/* Armijo step */
|
||||
rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
|
||||
/* x=R(x,teta); */
|
||||
cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
|
||||
if (!rsdstat) {
|
||||
/* cost reduced, update solution */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
|
||||
} else {
|
||||
/* no cost reduction, break loop */
|
||||
break;
|
||||
}
|
||||
}
|
||||
// int rsdstat=0;
|
||||
// /* RSD solution - disabled */
|
||||
// for (ci=0; ci<itmax_sd; ci++) {
|
||||
// /* Armijo step */
|
||||
// rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, yd, cohd, bbd,iwd,wtd,&fx,cbhandle,gWORK1);
|
||||
// /* x=R(x,teta); */
|
||||
// cudakernel_fns_R(N,xd,etad,x_propd,cbhandle);
|
||||
// if (!rsdstat) {
|
||||
// /* cost reduced, update solution */
|
||||
// cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
|
||||
// } else {
|
||||
// /* no cost reduction, break loop */
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
|
||||
float Delta_new=itrr(ThreadsPerBlock, BlocksPerGrid, N, M, xd, Yd,Zd,admm_rho, etad, Hetad, yd, cohd, bbd, iwd, wtd, cbhandle, gWORK1);
|
||||
#ifdef DEBUG
|
||||
printf("TR radius given=%f est=%f\n",Delta0,Delta_new);
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
//old values
|
||||
//Delta_bar=MIN(fx,Delta_bar);
|
||||
//Delta0=Delta_bar*0.125f;
|
||||
Delta0=MIN(Delta_new,0.01f); /* need to be more restrictive for EM */
|
||||
Delta_bar=Delta0*8.0f;
|
||||
|
||||
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
|
||||
|
||||
cudakernel_fns_fupdate_weights(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,robust_nu);
|
||||
|
||||
Delta_bar=MIN(fx,Delta_bar);
|
||||
Delta0=Delta_bar*0.125f;
|
||||
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("NEW RSD cost=%g\n",fx);
|
||||
#endif
|
||||
|
@ -787,7 +952,8 @@ printf("NEW RTR cost=%g\n",fx);
|
|||
|
||||
/***************************************************/
|
||||
cudaDeviceSynchronize();
|
||||
/* w <= (8+nu)/(1+error^2), q<=w-log(w) */
|
||||
/* w <= (p+nu)/(1+error^2), q<=w-log(w) */
|
||||
/* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
|
||||
cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
|
||||
/* sumq<=sum(w-log(w))/N */
|
||||
cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
|
||||
|
@ -807,7 +973,14 @@ printf("NEW RTR cost=%g\n",fx);
|
|||
#ifdef DEBUG
|
||||
printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
|
||||
#endif
|
||||
dp->robust_nu=(double)robust_nu1;
|
||||
/* seems pedantic, but make sure new value for robust_nu fits within bounds */
|
||||
if (robust_nu1<robust_nulow) {
|
||||
dp->robust_nu=robust_nulow;
|
||||
} else if (robust_nu1>robust_nuhigh) {
|
||||
dp->robust_nu=robust_nuhigh;
|
||||
} else {
|
||||
dp->robust_nu=(double)robust_nu1;
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Cost final %g initial %g\n",fx,fx0);
|
||||
|
@ -846,3 +1019,387 @@ printf("NEW RTR cost=%g\n",fx);
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* storage:
|
||||
8N * 6 + N + 8M * 2 + 2M + M (base storage)
|
||||
MAX( 2 * Blocks + 4, 8N(1 + ceil(M/Threads))) for functions
|
||||
Blocks = ceil(M/Threads)
|
||||
*/
|
||||
int
|
||||
nsd_solve_cuda_robust_admm_fl(
|
||||
float *x0, /* initial values and updated solution at output (size 8*N float) */
|
||||
float *Y, /* Lagrange multiplier size 8N */
|
||||
float *Z, /* consensus term B Z size 8N */
|
||||
float *y, /* data vector (size 8*M float) */
|
||||
int N, /* no of stations */
|
||||
int M, /* no of constraints */
|
||||
int itmax, /* maximum number of iterations */
|
||||
float admm_rho, /* ADMM regularization */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
double *info, /* initial and final residuals */
|
||||
cublasHandle_t cbhandle, /* device handle */
|
||||
float *gWORK, /* GPU allocated memory */
|
||||
int tileoff, /* tile offset when solving for many chunks */
|
||||
int ntiles, /* total tile (data) size being solved for */
|
||||
me_data_t *adata)
|
||||
{
|
||||
|
||||
/* general note: all device variables end with a 'd' */
|
||||
cudaError_t err;
|
||||
cublasStatus_t cbstatus;
|
||||
|
||||
/* ME data */
|
||||
me_data_t *dp=(me_data_t*)adata;
|
||||
int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
|
||||
/* coherency on device */
|
||||
float *cohd;
|
||||
/* baseline-station map on device/host */
|
||||
char *bbd;
|
||||
|
||||
/* calculate no of cuda threads and blocks */
|
||||
int ThreadsPerBlock=128;
|
||||
int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock;
|
||||
|
||||
|
||||
/* reshape x to make J: 2Nx2 complex double
|
||||
*/
|
||||
complex float *x;
|
||||
if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
/* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
|
||||
to
|
||||
J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
|
||||
*/
|
||||
float *Jd=(float*)x;
|
||||
/* re J(0,0) */
|
||||
my_fcopy(N, &x0[0], 8, &Jd[0], 4);
|
||||
/* im J(0,0) */
|
||||
my_fcopy(N, &x0[1], 8, &Jd[1], 4);
|
||||
/* re J(1,0) */
|
||||
my_fcopy(N, &x0[4], 8, &Jd[2], 4);
|
||||
/* im J(1,0) */
|
||||
my_fcopy(N, &x0[5], 8, &Jd[3], 4);
|
||||
/* re J(0,1) */
|
||||
my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
|
||||
/* im J(0,1) */
|
||||
my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
|
||||
/* re J(1,1) */
|
||||
my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
|
||||
/* im J(1,1) */
|
||||
my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
|
||||
|
||||
|
||||
complex float *Zx,*Yx;
|
||||
if ((Zx=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if ((Yx=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
float *YY=(float*)Yx;
|
||||
my_fcopy(N, &Y[0], 8, &YY[0], 4);
|
||||
my_fcopy(N, &Y[1], 8, &YY[1], 4);
|
||||
my_fcopy(N, &Y[4], 8, &YY[2], 4);
|
||||
my_fcopy(N, &Y[5], 8, &YY[3], 4);
|
||||
my_fcopy(N, &Y[2], 8, &YY[4*N], 4);
|
||||
my_fcopy(N, &Y[3], 8, &YY[4*N+1], 4);
|
||||
my_fcopy(N, &Y[6], 8, &YY[4*N+2], 4);
|
||||
my_fcopy(N, &Y[7], 8, &YY[4*N+3], 4);
|
||||
float *ZZ=(float*)Zx;
|
||||
my_fcopy(N, &Z[0], 8, &ZZ[0], 4);
|
||||
my_fcopy(N, &Z[1], 8, &ZZ[1], 4);
|
||||
my_fcopy(N, &Z[4], 8, &ZZ[2], 4);
|
||||
my_fcopy(N, &Z[5], 8, &ZZ[3], 4);
|
||||
my_fcopy(N, &Z[2], 8, &ZZ[4*N], 4);
|
||||
my_fcopy(N, &Z[3], 8, &ZZ[4*N+1], 4);
|
||||
my_fcopy(N, &Z[6], 8, &ZZ[4*N+2], 4);
|
||||
my_fcopy(N, &Z[7], 8, &ZZ[4*N+3], 4);
|
||||
|
||||
|
||||
int ci;
|
||||
|
||||
/***************************************************/
|
||||
cuFloatComplex *xd,*fgradxd,*etad,*zd,*x_propd,*z_propd,*Yd,*Zd;
|
||||
float *yd;
|
||||
float *wtd,*qd; /* for robust weight and log(weight) */
|
||||
float robust_nu=(float)dp->robust_nu;
|
||||
float q_sum,robust_nu1;
|
||||
float deltanu;
|
||||
int Nd=100; /* no of points where nu is sampled, note Nd<N */
|
||||
if (Nd>M) { Nd=M; }
|
||||
deltanu=(float)(robust_nuhigh-robust_nulow)/(float)Nd;
|
||||
|
||||
/* for counting how many baselines contribute to each station
|
||||
grad/hess calculation */
|
||||
float *iwd,*iw;
|
||||
if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
unsigned long int moff=0;
|
||||
fgradxd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N; /* 4N complex means 8N float */
|
||||
etad=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
zd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
x_propd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
xd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
z_propd=(cuFloatComplex*)&gWORK[moff];
|
||||
moff+=8*N;
|
||||
|
||||
yd=&gWORK[moff];
|
||||
moff+=8*M;
|
||||
cohd=&gWORK[moff];
|
||||
moff+=Nbase*8;
|
||||
bbd=(char*)&gWORK[moff];
|
||||
unsigned long int charstor=(Nbase*2*sizeof(char))/sizeof(float);
|
||||
if (!charstor || charstor%4) {
|
||||
moff+=(charstor/4+1)*4; /* NOTE +4 multiple to align memory */
|
||||
} else {
|
||||
moff+=charstor;
|
||||
}
|
||||
iwd=&gWORK[moff];
|
||||
if (!(N%4)) {
|
||||
moff+=N;
|
||||
} else {
|
||||
moff+=(N/4+1)*4;
|
||||
}
|
||||
wtd=&gWORK[moff];
|
||||
if (!(M%4)) {
|
||||
moff+=M;
|
||||
} else {
|
||||
moff+=(M/4+1)*4;
|
||||
}
|
||||
qd=&gWORK[moff];
|
||||
if (!(M%4)) {
|
||||
moff+=M;
|
||||
} else {
|
||||
moff+=(M/4+1)*4;
|
||||
}
|
||||
|
||||
|
||||
cudaMalloc((void **)&Yd, 4*N*sizeof(cuFloatComplex));
|
||||
cudaMalloc((void **)&Zd, 4*N*sizeof(cuFloatComplex));
|
||||
|
||||
/* remaining memory */
|
||||
float *gWORK1=&gWORK[moff];
|
||||
|
||||
/* yd <=y : V */
|
||||
err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* need to give right offset for coherencies */
|
||||
/* offset: cluster offset+time offset */
|
||||
/* C */
|
||||
err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* correct offset for baselines */
|
||||
err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(char), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* xd <=x : solution */
|
||||
err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMemcpy(Yd, Yx, 8*N*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMemcpy(Zd, Zx, 8*N*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
float fx,fx0;
|
||||
|
||||
/* count how many baselines contribute to each station, store (inverse) in iwd */
|
||||
count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
|
||||
err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
free(iw);
|
||||
|
||||
/* set initial weights to 1 by a cuda kernel */
|
||||
cudakernel_setweights_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, wtd, 1.0f);
|
||||
fx=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,yd,cohd,bbd,wtd,cbhandle,gWORK1);
|
||||
fx0=fx;
|
||||
#ifdef DEBUG
|
||||
printf("Initial Cost=%g\n",fx0);
|
||||
#endif
|
||||
/***************************************************/
|
||||
// gradient at x0;
|
||||
cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
|
||||
// Hessian
|
||||
cudakernel_fns_fhess_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,xd,zd,yd,cohd,bbd,iwd,wtd,cbhandle,gWORK1);
|
||||
// initial step = 1/||Hess||
|
||||
float hess_nrm;
|
||||
cublasScnrm2(cbhandle,4*N,zd,1,&hess_nrm);
|
||||
float t=1.0f/hess_nrm;
|
||||
/* if initial step too small */
|
||||
if (t<1e-6f) {
|
||||
t=1e-6f;
|
||||
}
|
||||
|
||||
/* z <= x */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
|
||||
float theta=1.0f;
|
||||
float ALPHA = 1.01f; // step-size growth factor
|
||||
float BETA = 0.5f; // step-size shrinkage factor
|
||||
int k;
|
||||
cuFloatComplex alpha;
|
||||
|
||||
for (k=0; k<itmax; k++) {
|
||||
/* x_prop <= x */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,x_propd,1);
|
||||
/* z_prop <= z */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,zd,1,z_propd,1);
|
||||
|
||||
/* x <= z - t * grad */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,zd,1,xd,1);
|
||||
alpha.x=-t;alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, xd, 1);
|
||||
|
||||
/* if ||x-z|| == t||grad|| is below threshold, stop iteration */
|
||||
float grad_nrm,x_nrm;
|
||||
cublasScnrm2(cbhandle,4*N,fgradxd,1,&grad_nrm);
|
||||
cublasScnrm2(cbhandle,4*N,xd,1,&x_nrm);
|
||||
/* norm(y-x)/max(1,norm(x)); */
|
||||
if (grad_nrm*t/MAX(1.0f,x_nrm) < 1e-6f) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
/* theta = 2/(1 + sqrt(1+4/(theta^2))); */
|
||||
theta=2.0f/(1.0f + sqrtf(1.0f+4.0f/(theta*theta)));
|
||||
|
||||
/* z = x + (1-theta)*(x-x_prop);
|
||||
z = (2-theta)*x - (1-theta) * x_prop */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,xd,1,zd,1);
|
||||
alpha.x=(2.0f-theta);alpha.y=0.0f;
|
||||
cbstatus=cublasCscal(cbhandle,4*N,&alpha,zd,1);
|
||||
alpha.x=-(1.0f-theta);alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, x_propd, 1, zd, 1);
|
||||
|
||||
/* eta = grad_old;
|
||||
grad <= grad_f( z ) */
|
||||
cbstatus=cublasCcopy(cbhandle,4*N,fgradxd,1,etad,1);
|
||||
cudakernel_fns_fgrad_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,zd,Yd,Zd,admm_rho,fgradxd,yd,cohd,bbd,iwd,wtd,1,cbhandle,gWORK1);
|
||||
|
||||
/* z_prop <= z_prop - z */
|
||||
alpha.x=-1.0f;alpha.y=0.0f;
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, zd, 1, z_propd, 1);
|
||||
/* eta <= eta - new_grad */
|
||||
cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, fgradxd, 1, etad, 1);
|
||||
|
||||
/* ||z-z_prop|| */
|
||||
float ydiffnrm;
|
||||
cublasScnrm2(cbhandle,4*N,z_propd,1,&ydiffnrm);
|
||||
/* (z_zold)'*(grad-grad_old) */
|
||||
float dot_ydiff_gdiff;
|
||||
cbstatus=cublasSdot(cbhandle, 8*N, (float*)z_propd, 1, (float*)etad, 1, &dot_ydiff_gdiff);
|
||||
#ifdef DEBUG
|
||||
printf("num=%e den=%e\n",ydiffnrm,dot_ydiff_gdiff);
|
||||
#endif
|
||||
/* the above can be NAN, if so break loop */
|
||||
if (isnan(dot_ydiff_gdiff) || isinf(dot_ydiff_gdiff)) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
/* backtracking
|
||||
t_hat = 0.5*(norm(y-y_old)^2)/abs((y - y_old)'*(g_old - g));
|
||||
t = min( ALPHA*t, max( BETA*t, t_hat ));
|
||||
*/
|
||||
float t_hat=0.5f*(ydiffnrm*ydiffnrm)/fabsf(dot_ydiff_gdiff);
|
||||
t=MIN(ALPHA*t,MAX(BETA*t,t_hat));
|
||||
#ifdef DEBUG
|
||||
printf("k=%d theta=%e step=%e\n",k,theta,t);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* final residual */
|
||||
fx=cudakernel_fns_f_robust_admm(ThreadsPerBlock,BlocksPerGrid,N,M,xd,Yd,Zd,admm_rho,yd,cohd,bbd,wtd,cbhandle,gWORK1);
|
||||
info[1]=fx;
|
||||
#ifdef DEBUG
|
||||
printf("NEW NSD cost=%g\n",fx);
|
||||
#endif
|
||||
|
||||
/***************************************************/
|
||||
cudaDeviceSynchronize();
|
||||
/* w <= (p+nu)/(1+error^2), q<=w-log(w) */
|
||||
/* p = 2, use MAX() residual of XX,XY,YX,YY, not the sum */
|
||||
cudakernel_fns_fupdate_weights_q(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd,wtd,qd,robust_nu);
|
||||
/* sumq<=sum(w-log(w))/N */
|
||||
cbstatus=cublasSasum(cbhandle, M, qd, 1, &q_sum);
|
||||
q_sum/=(float)M;
|
||||
#ifdef DEBUG
|
||||
printf("deltanu=%f sum(w-log(w))=%f\n",deltanu,q_sum);
|
||||
#endif
|
||||
/* for nu range 2~numax evaluate, p-variate T
|
||||
psi((nu0+p)/2)-ln((nu0+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1
|
||||
note: AECM not ECME
|
||||
and find min(| |) */
|
||||
int ThreadsPerBlock2=ThreadsPerBlock/4;
|
||||
cudakernel_evaluatenu_fl_eight(ThreadsPerBlock2, (Nd+ThreadsPerBlock-1)/ThreadsPerBlock2, Nd, q_sum, qd, deltanu,(float)robust_nulow,robust_nu);
|
||||
/* find min(abs()) value */
|
||||
cbstatus=cublasIsamin(cbhandle, Nd, qd, 1, &ci); /* 1 based index */
|
||||
robust_nu1=(float)robust_nulow+(float)(ci-1)*deltanu;
|
||||
#ifdef DEBUG
|
||||
printf("nu updated %d from %f [%lf,%lf] to %f\n",ci,robust_nu,robust_nulow,robust_nuhigh,robust_nu1);
|
||||
#endif
|
||||
/* seems pedantic, but make sure new value for robust_nu fits within bounds */
|
||||
if (robust_nu1<robust_nulow) {
|
||||
dp->robust_nu=robust_nulow;
|
||||
} else if (robust_nu1>robust_nuhigh) {
|
||||
dp->robust_nu=robust_nuhigh;
|
||||
} else {
|
||||
dp->robust_nu=(double)robust_nu1;
|
||||
}
|
||||
|
||||
if(fx0>fx) {
|
||||
//printf("Cost final %g initial %g\n",fx,fx0);
|
||||
/* copy back current solution */
|
||||
err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
|
||||
/* copy back solution to x0 : format checked*/
|
||||
/* re J(0,0) */
|
||||
my_fcopy(N, &Jd[0], 4, &x0[0], 8);
|
||||
/* im J(0,0) */
|
||||
my_fcopy(N, &Jd[1], 4, &x0[1], 8);
|
||||
/* re J(1,0) */
|
||||
my_fcopy(N, &Jd[2], 4, &x0[4], 8);
|
||||
/* im J(1,0) */
|
||||
my_fcopy(N, &Jd[3], 4, &x0[5], 8);
|
||||
/* re J(0,1) */
|
||||
my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
|
||||
/* im J(0,1) */
|
||||
my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
|
||||
/* re J(1,1) */
|
||||
my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
|
||||
/* im J(1,1) */
|
||||
my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
|
||||
|
||||
}
|
||||
cudaFree(Yd);
|
||||
cudaFree(Zd);
|
||||
|
||||
free(x);
|
||||
free(Yx);
|
||||
free(Zx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -71,6 +71,9 @@
|
|||
#define STYPE_RING 3
|
||||
#define STYPE_SHAPELET 4
|
||||
|
||||
/* max source name length, increase it if names get longer */
|
||||
#define MAX_SNAME 2048
|
||||
|
||||
/********* constants - from levmar ******************/
|
||||
#define CLM_INIT_MU 1E-03
|
||||
#define CLM_STOP_THRESH 1E-17
|
||||
|
@ -192,6 +195,8 @@ typedef struct thread_data_base_ {
|
|||
|
||||
/* following for ignoring clusters in simulation */
|
||||
int *ignlist; /* Mx1 array, if any value 1, that cluster will not be simulated */
|
||||
/* flag for adding model to data */
|
||||
int add_to_data;
|
||||
|
||||
/* following used for multifrequency (channel) data */
|
||||
double *freqs;
|
||||
|
@ -264,6 +269,18 @@ typedef struct thread_data_setwt_ {
|
|||
|
||||
} thread_data_setwt_t;
|
||||
|
||||
/* structure for weight calculation for baselines */
|
||||
typedef struct thread_data_baselinewt_ {
|
||||
int Nb; /* no of baselines this handle */
|
||||
int boff; /* baseline offset per thread */
|
||||
|
||||
double *wt; /* 8 values per baseline */
|
||||
double *u,*v;
|
||||
double freq0;
|
||||
|
||||
} thread_data_baselinewt_t;
|
||||
|
||||
|
||||
|
||||
/* structure for worker threads for jacobian calculation */
|
||||
typedef struct thread_data_jac_ {
|
||||
|
@ -436,6 +453,21 @@ read_solutions(FILE *sfp,double *p,clus_source_t *carr,int N,int M);
|
|||
*/
|
||||
extern int
|
||||
update_ignorelist(const char *ignfile, int *ignlist, int M, clus_source_t *carr);
|
||||
|
||||
/* read ADMM regularization factor per cluster from text file, format:
|
||||
cluster_id hybrid_parameter admm_rho
|
||||
...
|
||||
...
|
||||
(M values)
|
||||
and store it in array arho : size Mtx1, taking into account the hybrid parameter
|
||||
also in array arhoslave : size Mx1, without taking hybrid params into account
|
||||
|
||||
admm_rho : can be 0 to ignore consensus, just normal calibration
|
||||
*/
|
||||
|
||||
extern int
|
||||
read_arho_fromfile(const char *admm_rho_file,int Mt,double *arho, int M, double *arhoslave);
|
||||
|
||||
/****************************** dataio.c ****************************/
|
||||
/* open binary file for input/output
|
||||
datfile: data file descriptor id
|
||||
|
@ -874,7 +906,9 @@ __attribute__ ((target(MIC)))
|
|||
extern int
|
||||
lbfgs_fit_robust(
|
||||
void (*func)(double *p, double *hx, int m, int n, void *adata),
|
||||
double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads, void *adata);
|
||||
double *p, double *x, int m, int n, int itmax, int lbfgs_m, int gpu_threads,
|
||||
int whiten, /* if >0 whiten data 1: NCP, 2... */
|
||||
void *adata);
|
||||
#ifdef HAVE_CUDA
|
||||
extern int
|
||||
lbfgs_fit_robust_cuda(
|
||||
|
@ -925,15 +959,17 @@ calculate_residuals_multifreq(double *u,double *v,double *w,double *p,double *x,
|
|||
|
||||
/*
|
||||
calculate visibilities for multiple channels, no solutions are used
|
||||
note: output column x is set to 0
|
||||
note: output column x is set to 0 if add_to_data ==0, else model is added to data
|
||||
*/
|
||||
extern int
|
||||
predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt);
|
||||
predict_visibilities_multifreq(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt,int add_to_data);
|
||||
|
||||
|
||||
/* predict with solutions in p , ignore clusters flagged in ignorelist (Mx1) array*/
|
||||
/* predict with solutions in p , ignore clusters flagged in ignorelist (Mx1) array
|
||||
also correct final data with solutions for cluster ccid, if valid
|
||||
*/
|
||||
extern int
|
||||
predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignorelist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt);
|
||||
predict_visibilities_multifreq_withsol(double *u,double *v,double *w,double *p,double *x,int *ignorelist,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta,double dec0,int Nt,int add_to_data, int ccid, double rho);
|
||||
/****************************** mderiv.cu ****************************/
|
||||
/* cuda driver for kernel */
|
||||
/* ThreadsPerBlock: keep <= 128
|
||||
|
@ -1331,6 +1367,7 @@ osrlevmar_der_single_cuda_fl(
|
|||
int ntiles, /* total tile (data) size being solved for */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
int randomize, /* if >0 randomize */
|
||||
int whiten, /* if >0 whiten data 1: NCP, 2... */
|
||||
void *adata);
|
||||
#endif /* HAVE_CUDA */
|
||||
|
||||
|
@ -1357,6 +1394,7 @@ rlevmar_der_single_nocuda(
|
|||
int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
|
||||
int Nt, /* no of threads */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
int whiten, /* if >0 whiten data 1: NCP, 2... */
|
||||
void *adata);
|
||||
|
||||
/* robust LM, OS acceleration */
|
||||
|
@ -1396,6 +1434,7 @@ osrlevmar_der_single_nocuda(
|
|||
int Nt, /* no of threads */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
int randomize, /* if >0 randomize */
|
||||
int whiten, /* if >0 whiten data 1: NCP, 2... */
|
||||
void *adata);
|
||||
|
||||
/****************************** updatenu.c ****************************/
|
||||
|
@ -1425,6 +1464,15 @@ __attribute__ ((target(MIC)))
|
|||
#endif
|
||||
extern double
|
||||
update_w_and_nu(double nu0, double *w, double *ed, int N, int Nt, double nulow, double nuhigh);
|
||||
|
||||
/* update weights array wt by multiplying it with the inverse density function
|
||||
1/( 1+f(u,v) )
|
||||
as u,v->inf, f(u,v) -> 0 so long baselines are not affected
|
||||
wt : Nbase*8 x 1
|
||||
u,v : Nbase x 1
|
||||
note: u = u/c, v=v/c here, so need freq to convert to wavelengths */
|
||||
extern void
|
||||
add_whitening_weights(int Nbase, double *wt, double *u, double *v, double freq0, int Nt);
|
||||
/****************************** clmfit_nocuda.c ****************************/
|
||||
/* LM with LAPACK */
|
||||
/** keep interface almost the same as in levmar **/
|
||||
|
@ -1734,6 +1782,22 @@ rtr_solve_nocuda_robust(
|
|||
double *info, /* initial and final residuals */
|
||||
me_data_t *adata);
|
||||
|
||||
/* Nesterov's SD */
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern int
|
||||
nsd_solve_nocuda_robust(
|
||||
double *x, /* initial values and updated solution at output (size 8*N double) */
|
||||
double *y, /* data vector (size 8*M double) */
|
||||
int N, /* no. of stations */
|
||||
int M, /* no. of constraints */
|
||||
int itmax, /* maximum number of iterations */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
double *info, /* initial and final residuals */
|
||||
me_data_t *adata); /* pointer to additional data
|
||||
*/
|
||||
|
||||
/****************************** rtr_solve_robust_admm.c ****************************/
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
|
@ -1824,6 +1888,24 @@ rtr_solve_cuda_robust_fl(
|
|||
int tileoff, /* tile offset when solving for many chunks */
|
||||
int ntiles, /* total tile (data) size being solved for */
|
||||
me_data_t *adata);
|
||||
|
||||
|
||||
/* Nesterov's steepest descent */
|
||||
extern int
|
||||
nsd_solve_cuda_robust_fl(
|
||||
float *x0, /* initial values and updated solution at output (size 8*N float) */
|
||||
float *y, /* data vector (size 8*M float) */
|
||||
int N, /* no of stations */
|
||||
int M, /* no of constraints */
|
||||
int itmax, /* maximum number of iterations */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
double *info, /* initial and final residuals */
|
||||
cublasHandle_t cbhandle, /* device handle */
|
||||
float *gWORK, /* GPU allocated memory */
|
||||
int tileoff, /* tile offset when solving for many chunks */
|
||||
int ntiles, /* total tile (data) size being solved for */
|
||||
me_data_t *adata);
|
||||
|
||||
/****************************** rtr_solve_robust_cuda_admm.c ****************************/
|
||||
/* ADMM solver */
|
||||
extern int
|
||||
|
@ -1834,8 +1916,7 @@ rtr_solve_cuda_robust_admm_fl(
|
|||
float *y, /* data vector (size 8*M float) */
|
||||
int N, /* no of stations */
|
||||
int M, /* no of constraints */
|
||||
int itmax_sd, /* maximum number of iterations RSD */
|
||||
int itmax_rtr, /* maximum number of iterations RTR */
|
||||
int itmax_rtr, /* maximum number of iterations */
|
||||
float Delta_bar, float Delta0, /* Trust region radius and initial value */
|
||||
float admm_rho, /* ADMM regularization */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
|
@ -1846,6 +1927,26 @@ rtr_solve_cuda_robust_admm_fl(
|
|||
int tileoff, /* tile offset when solving for many chunks */
|
||||
int ntiles, /* total tile (data) size being solved for */
|
||||
me_data_t *adata);
|
||||
|
||||
|
||||
/* Nesterov's SD */
|
||||
extern int
|
||||
nsd_solve_cuda_robust_admm_fl(
|
||||
float *x0, /* initial values and updated solution at output (size 8*N float) */
|
||||
float *Y, /* Lagrange multiplier size 8N */
|
||||
float *Z, /* consensus term B Z size 8N */
|
||||
float *y, /* data vector (size 8*M float) */
|
||||
int N, /* no of stations */
|
||||
int M, /* no of constraints */
|
||||
int itmax, /* maximum number of iterations */
|
||||
float admm_rho, /* ADMM regularization */
|
||||
double robust_nulow, double robust_nuhigh, /* robust nu range */
|
||||
double *info, /* initial and final residuals */
|
||||
cublasHandle_t cbhandle, /* device handle */
|
||||
float *gWORK, /* GPU allocated memory */
|
||||
int tileoff, /* tile offset when solving for many chunks */
|
||||
int ntiles, /* total tile (data) size being solved for */
|
||||
me_data_t *adata);
|
||||
#endif /* HAVE_CUDA */
|
||||
/****************************** lmfit.c ****************************/
|
||||
/****************************** lmfit_nocuda.c ****************************/
|
||||
|
@ -1885,6 +1986,7 @@ random_permutation(int n, int weighted_iter, double *w);
|
|||
#define SM_RLM_RLBFGS 2
|
||||
#define SM_RTR_OSLM_LBFGS 4
|
||||
#define SM_RTR_OSRLM_RLBFGS 5
|
||||
#define SM_NSD_RLBFGS 6
|
||||
/* fit visibilities
|
||||
u,v,w: u,v,w coordinates (wavelengths) size Nbase*tilesz x 1
|
||||
u,v,w are ordered with baselines, timeslots
|
||||
|
@ -1913,6 +2015,7 @@ random_permutation(int n, int weighted_iter, double *w);
|
|||
nulow,nuhigh: robust nu search range
|
||||
randomize: if >0, randomize cluster selection in SAGE and OS subset selection
|
||||
|
||||
whiten : if >0 whiten data 1: NCP, 2...
|
||||
mean_nu: output mean value of nu
|
||||
res_0,res_1: initial and final residuals (output)
|
||||
return val=0 if final residual< initial residual
|
||||
|
@ -1923,7 +2026,7 @@ __attribute__ ((target(MIC)))
|
|||
#endif
|
||||
extern int
|
||||
sagefit_visibilities(double *u, double *v, double *w, double *x, int N,
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt,int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv, int solver_mode, double nulow, double nuhigh, int randomize, double *mean_nu, double *res_0, double *res_1);
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double uvmin, int Nt,int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv, int solver_mode, double nulow, double nuhigh, int randomize, int whiten, double *mean_nu, double *res_0, double *res_1);
|
||||
|
||||
/* same as above, but uses 2 GPUS in the LM stage */
|
||||
extern int
|
||||
|
@ -2010,6 +2113,9 @@ typedef struct pipeline_ {
|
|||
#ifndef PT_DO_WORK_RRTR /* Robust RTR */
|
||||
#define PT_DO_WORK_RRTR 8
|
||||
#endif
|
||||
#ifndef PT_DO_WORK_NSD /* Nesterov's SD */
|
||||
#define PT_DO_WORK_NSD 9
|
||||
#endif
|
||||
#ifndef PT_DO_MEMRESET
|
||||
#define PT_DO_MEMRESET 99
|
||||
#endif
|
||||
|
@ -2086,7 +2192,7 @@ typedef struct gb_data_admm_fl_ {
|
|||
float *p[2]; /* pointer to parameters being solved by each thread */
|
||||
float *Y[2]; /* pointer to Lagrange multiplier */
|
||||
float *Z[2]; /* pointer to consensus term */
|
||||
float admm_rho;
|
||||
float admm_rho[2];
|
||||
float *x[2]; /* pointer to data being fit by each thread */
|
||||
int M[2];
|
||||
int N[2];
|
||||
|
@ -2233,23 +2339,23 @@ update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi);
|
|||
Y : Lagrange multiplier
|
||||
BZ : consensus term
|
||||
Y,BZ : size same as pp : 8*N*Mt x1 double values (re,img) for each station/direction
|
||||
admm_rho : regularization factor
|
||||
admm_rho : regularization factor array size Mx1
|
||||
*/
|
||||
extern int
|
||||
sagefit_visibilities_admm(double *u, double *v, double *w, double *x, int N,
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1);
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode,double nulow, double nuhigh,int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1);
|
||||
|
||||
/* ADMM cost function = normal_cost + ||Y^H(J-BZ)|| + rho/2 ||J-BZ||^2 */
|
||||
/* extra params
|
||||
Y : Lagrange multiplier
|
||||
BZ : consensus term
|
||||
Y,BZ : size same as pp : 8*N*Mt x1 double values (re,img) for each station/direction
|
||||
admm_rho : regularization factor
|
||||
admm_rho : regularization factor array size Mx1
|
||||
*/
|
||||
#ifdef HAVE_CUDA
|
||||
extern int
|
||||
sagefit_visibilities_admm_dual_pt_flt(double *u, double *v, double *w, double *x, int N,
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize, double admm_rho, double *mean_nu, double *res_0, double *res_1);
|
||||
int Nbase, int tilesz, baseline_t *barr, clus_source_t *carr, complex double *coh, int M, int Mt, double freq0, double fdelta, double *pp, double *Y, double *BZ, double uvmin, int Nt, int max_emiter, int max_iter, int max_lbfgs, int lbfgs_m, int gpu_threads, int linsolv,int solver_mode, double nulow, double nuhigh, int randomize, double *admm_rho, double *mean_nu, double *res_0, double *res_1);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -336,3 +336,151 @@ update_nu(double logsumw, int Nd, int Nt, double nulow, double nuhigh, int p, do
|
|||
free(q);
|
||||
return thisnu;
|
||||
}
|
||||
|
||||
|
||||
/* x = sqrt(u^2+v^2) */
|
||||
static double
|
||||
ncp_weight(double ud) {
|
||||
/* fo(x) =
|
||||
a1*exp(-((x-b1)/c1)^2) + a2*exp(-((x-b2)/c2)^2) +
|
||||
a3*exp(-((x-b3)/c3)^2) + a4*exp(-((x-b4)/c4)^2) +
|
||||
a5*exp(-((x-b5)/c5)^2) + a6*exp(-((x-b6)/c6)^2)
|
||||
mean(fo(x)) is about 1
|
||||
*/
|
||||
float x=(float)ud;
|
||||
if (x<40.0f) { return 1.0; }
|
||||
if (x>800.0f) {
|
||||
return 1.0;
|
||||
}
|
||||
/* else [40,285] */
|
||||
float r[6];
|
||||
float a1 =-0.9415f;
|
||||
float b1 =117.1f;
|
||||
float c1 =15.08f;
|
||||
float a2 =5.231f;
|
||||
float b2 =49.57f;
|
||||
float c2 =13.79f;
|
||||
float a3 =2.209f;
|
||||
float b3 =67.29f;
|
||||
float c3 =14.86f;
|
||||
float a4 =10.43f;
|
||||
float b4 =72.19f;
|
||||
float c4 =200.8f;
|
||||
float a5 =104.9f;
|
||||
float b5 =98.72f;
|
||||
float c5 =65.8f;
|
||||
float a6 =-101.3f;
|
||||
float b6 =101.2f;
|
||||
float c6 =66.63f;
|
||||
r[0]=(x-b1)/c1;
|
||||
r[1]=(x-b2)/c2;
|
||||
r[2]=(x-b3)/c3;
|
||||
r[3]=(x-b4)/c4;
|
||||
r[4]=(x-b5)/c5;
|
||||
r[5]=(x-b6)/c6;
|
||||
r[0]*=-r[0];
|
||||
r[1]*=-r[1];
|
||||
r[2]*=-r[2];
|
||||
r[3]*=-r[3];
|
||||
r[4]*=-r[4];
|
||||
r[5]*=-r[5];
|
||||
float sum=0.0f;
|
||||
sum+=a1*expf(r[0]);
|
||||
sum+=a2*expf(r[1]);
|
||||
sum+=a3*expf(r[2]);
|
||||
sum+=a4*expf(r[3]);
|
||||
sum+=a5*expf(r[4]);
|
||||
sum+=a6*expf(r[5]);
|
||||
return (1.0/((double)sum+1.0)); /* as x-> inf, goes to 1 */
|
||||
}
|
||||
|
||||
static void *
|
||||
threadfn_setblweight(void *data) {
|
||||
thread_data_baselinewt_t *t=(thread_data_baselinewt_t*)data;
|
||||
|
||||
int ci;
|
||||
for (ci=0; ci<t->Nb; ci++) {
|
||||
/* get sqrt(u^2+v^2) */
|
||||
double uu=t->u[ci+t->boff]*t->freq0;
|
||||
double vv=t->v[ci+t->boff]*t->freq0;
|
||||
double a=ncp_weight(sqrt(uu*uu+vv*vv));
|
||||
t->wt[8*(ci+t->boff)]*=a;
|
||||
t->wt[8*(ci+t->boff)+1]*=a;
|
||||
t->wt[8*(ci+t->boff)+2]*=a;
|
||||
t->wt[8*(ci+t->boff)+3]*=a;
|
||||
t->wt[8*(ci+t->boff)+4]*=a;
|
||||
t->wt[8*(ci+t->boff)+5]*=a;
|
||||
t->wt[8*(ci+t->boff)+6]*=a;
|
||||
t->wt[8*(ci+t->boff)+7]*=a;
|
||||
printf("%lf %lf %lf\n",uu,vv,a);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* update weights array wt by multiplying it with the inverse density function
|
||||
1/( 1+f(u,v) )
|
||||
as u,v->inf, f(u,v) -> 0 so long baselines are not affected
|
||||
wt : Nbase*8 x 1
|
||||
u,v : Nbase x 1
|
||||
note: u = u/c, v=v/c here, so need freq to convert to wavelengths */
|
||||
void
|
||||
add_whitening_weights(int Nbase, double *wt, double *u, double *v, double freq0, int Nt) {
|
||||
pthread_attr_t attr;
|
||||
pthread_t *th_array;
|
||||
thread_data_baselinewt_t *threaddata;
|
||||
|
||||
int ci,nth1,nth;
|
||||
int Nthb0,Nthb;
|
||||
|
||||
Nthb0=(Nbase+Nt-1)/Nt;
|
||||
|
||||
pthread_attr_init(&attr);
|
||||
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
|
||||
|
||||
if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((threaddata=(thread_data_baselinewt_t*)malloc((size_t)Nt*sizeof(thread_data_baselinewt_t)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
/* iterate over threads, allocating baselines per thread */
|
||||
ci=0;
|
||||
for (nth=0; nth<Nt && ci<Nbase; nth++) {
|
||||
if (ci+Nthb0<Nbase) {
|
||||
Nthb=Nthb0;
|
||||
} else {
|
||||
Nthb=Nbase-ci;
|
||||
}
|
||||
|
||||
threaddata[nth].Nb=Nthb;
|
||||
threaddata[nth].boff=ci;
|
||||
threaddata[nth].wt=wt;
|
||||
threaddata[nth].u=u;
|
||||
threaddata[nth].v=v;
|
||||
threaddata[nth].freq0=freq0;
|
||||
|
||||
pthread_create(&th_array[nth],&attr,threadfn_setblweight,(void*)(&threaddata[nth]));
|
||||
/* next baseline set */
|
||||
ci=ci+Nthb;
|
||||
}
|
||||
|
||||
/* now wait for threads to finish */
|
||||
for(nth1=0; nth1<nth; nth1++) {
|
||||
pthread_join(th_array[nth1],NULL);
|
||||
}
|
||||
|
||||
pthread_attr_destroy(&attr);
|
||||
free(th_array);
|
||||
free(threaddata);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue