added model prediction and residual calculation using the GPU

This commit is contained in:
Sarod Yatawatta 2017-11-20 12:36:17 +01:00
parent e07a26b63f
commit 028ac53347
7 changed files with 1254 additions and 93 deletions

View File

@ -239,6 +239,7 @@ ParseCmdLine(int ac, char **av) {
}
}
/* real main program */
int
main(int argc, char **argv) {

View File

@ -537,12 +537,27 @@ cout<<myrank<<" : "<<cm<<": downweight ratio ("<<iodata_vec[cm].fratio<<") based
}
for(int cm=0; cm<mymscount; cm++) {
#ifndef HAVE_CUDA
if (!doBeam) {
precalculate_coherencies(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
} else {
precalculate_coherencies_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt);
}
#endif
#ifdef HAVE_CUDA
if (GPUpredict) {
precalculate_coherencies_withbeam_gpu(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,doBeam,Data::Nt);
} else {
if (!doBeam) {
precalculate_coherencies(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
} else {
precalculate_coherencies_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt);
}
}
#endif
}
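Note: every driver touched by this commit uses the same dispatch shape: CUDA builds gate on the runtime flag GPUpredict, and the GPU entry points take doBeam as an argument instead of branching on it, while the CPU fallback keeps the explicit beam/no-beam branch. A minimal sketch of that shape (stub names hypothetical, not the real signatures):

#include <stdio.h>
static void predict_cpu(int beam) { printf("CPU path, beam=%d\n", beam); }
static void predict_gpu(int beam) { printf("GPU path, beam=%d\n", beam); } /* handles both beam modes internally */
int main(void) {
  int GPUpredict=1, doBeam=0;
#ifdef HAVE_CUDA
  if (GPUpredict) { predict_gpu(doBeam); return 0; }
#endif
  predict_cpu(doBeam);
  return 0;
}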
/******************** ADMM *******************************/
@ -734,12 +749,27 @@ cout<<myrank<<" : "<<cm<<": downweight ratio ("<<iodata_vec[cm].fratio<<") based
/* write residuals to output */
for(int cm=0; cm<mymscount; cm++) {
#ifndef HAVE_CUDA
if (!doBeam) {
calculate_residuals_multifreq(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
calculate_residuals_multifreq_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
}
#endif
#ifdef HAVE_CUDA
if (GPUpredict) {
calculate_residuals_multifreq_withbeam_gpu(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,doBeam,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
if (!doBeam) {
calculate_residuals_multifreq(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
calculate_residuals_multifreq_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
}
}
#endif
}
tilex+=iodata_vec[0].tilesz;

View File

@ -526,18 +526,27 @@ main(int argc, char **argv) {
/* FIXME: uvmin is not needed in calibration, because it's taken care of by flags */
if (!Data::DoSim) {
/****************** calibration **************************/
////#ifndef HAVE_CUDA
#ifndef HAVE_CUDA
if (!doBeam) {
precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
} else {
precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
}
////#endif
////#ifdef HAVE_CUDA
//// precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
//// beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
////#endif
#endif
#ifdef HAVE_CUDA
if (GPUpredict) {
precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
} else {
if (!doBeam) {
precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
} else {
precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
}
}
#endif
#ifndef HAVE_CUDA
@ -651,12 +660,28 @@ main(int argc, char **argv) {
} else {
/* since the residual is calculated from xo (not from x), there is no need to unwhiten the data
in case x was whitened */
#ifndef HAVE_CUDA
if (!doBeam) {
calculate_residuals_multifreq(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
calculate_residuals_multifreq_withbeam(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
}
#endif
#ifdef HAVE_CUDA
if (GPUpredict) {
calculate_residuals_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
if (!doBeam) {
calculate_residuals_multifreq(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
} else {
calculate_residuals_multifreq_withbeam(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
}
}
#endif
}
/****************** end calibration **************************/
/****************** begin diagnostics ************************/

View File

@ -1224,6 +1224,7 @@ attach_gpu_to_thread1(int card, cublasHandle_t *cbhandle, cusolverDnHandle_t *s
status=cusolverDnCreate(solver_handle);
if (status != CUSOLVER_STATUS_SUCCESS) {
fprintf(stderr,"%s: %d: CUSOLV create fail %d\n",__FILE__,__LINE__,status);
fprintf(stderr,"common problems: not initialized %d, alloc fail %d, no compute %d\n",CUSOLVER_STATUS_NOT_INITIALIZED, CUSOLVER_STATUS_ALLOC_FAILED, CUSOLVER_STATUS_ARCH_MISMATCH);
exit(1);
}
}
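Printing the raw status integers works, but cuSOLVER has no public error-string helper; a small switch over the common cusolverStatus_t codes (a sketch, not part of this commit) would make the log self-describing:

#include <cusolverDn.h>
static const char *
cusolver_errstr(cusolverStatus_t s) {
  switch (s) {
    case CUSOLVER_STATUS_SUCCESS:         return "success";
    case CUSOLVER_STATUS_NOT_INITIALIZED: return "not initialized";
    case CUSOLVER_STATUS_ALLOC_FAILED:    return "alloc failed";
    case CUSOLVER_STATUS_ARCH_MISMATCH:   return "arch mismatch (no compute)";
    default:                              return "other error";
  }
}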

View File

@ -26,6 +26,25 @@
/* enable this for checking for kernel failure */
//#define CUDA_DBG
/* matrix multiplications */
/* C=A*B */
__device__ void
amb(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
c[0]=cuCadd(cuCmul(a[0],b[0]),cuCmul(a[1],b[2]));
c[1]=cuCadd(cuCmul(a[0],b[1]),cuCmul(a[1],b[3]));
c[2]=cuCadd(cuCmul(a[2],b[0]),cuCmul(a[3],b[2]));
c[3]=cuCadd(cuCmul(a[2],b[1]),cuCmul(a[3],b[3]));
}
/* C=A*B^H */
__device__ void
ambt(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
c[0]=cuCadd(cuCmul(a[0],cuConj(b[0])),cuCmul(a[1],cuConj(b[1])));
c[1]=cuCadd(cuCmul(a[0],cuConj(b[2])),cuCmul(a[1],cuConj(b[3])));
c[2]=cuCadd(cuCmul(a[2],cuConj(b[0])),cuCmul(a[3],cuConj(b[1])));
c[3]=cuCadd(cuCmul(a[2],cuConj(b[2])),cuCmul(a[3],cuConj(b[3])));
}
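amb() and ambt() treat each 4-element array as a row-major 2x2 Jones matrix [a0 a1; a2 a3]; ambt() multiplies by the conjugate transpose of the second factor, which is why b is indexed transposed (b[1] and b[2] swapped) and conjugated. A host-side restatement of the same arithmetic (sketch, C99 complex):

#include <complex.h>
/* C = A*B, row-major 2x2 */
static void amb_host(const double complex *a, const double complex *b, double complex *c) {
  c[0] = a[0]*b[0] + a[1]*b[2];
  c[1] = a[0]*b[1] + a[1]*b[3];
  c[2] = a[2]*b[0] + a[3]*b[2];
  c[3] = a[2]*b[1] + a[3]*b[3];
}
/* C = A*B^H: b indexed transposed and conjugated */
static void ambt_host(const double complex *a, const double complex *b, double complex *c) {
  c[0] = a[0]*conj(b[0]) + a[1]*conj(b[1]);
  c[1] = a[0]*conj(b[2]) + a[1]*conj(b[3]);
  c[2] = a[2]*conj(b[0]) + a[3]*conj(b[1]);
  c[3] = a[2]*conj(b[2]) + a[3]*conj(b[3]);
}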
__device__ void
radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float thetaGMST, float *az, float *el) {
float thetaLST=thetaGMST+longitude*180.0f/M_PI;
@ -327,6 +346,11 @@ shapelet_contrib__(int *dd, float u, float v, float w) {
__sincosf((float)dp->eP,&sinph,&cosph);
ut=a*(cosph*up-sinph*vp);
vt=b*(sinph*up+cosph*vp);
/* if (u,v) is far outside the scale (beta) of the shapelet modes, the result is almost always zero,
so check this here and return 0; otherwise spurious NaNs may result */
if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))<dp->beta) {
return make_cuDoubleComplex(0.0,0.0);
}
/* note: we decompose f(-l,m) so the Fourier transform is F(-u,v)
so negate the u grid */
Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float));
@ -345,45 +369,57 @@ shapelet_contrib__(int *dd, float u, float v, float w) {
free(Av);
free(cplx);
//return 2.0*M_PI*(realsum+_Complex_I*imagsum);
realsum*=2.0f*M_PI*a*b;
imagsum*=2.0f*M_PI*a*b;
/* additional safeguards */
if ( isnan(realsum) ) { realsum=0.0f; }
if ( isnan(imagsum) ) { imagsum=0.0f; }
return make_cuDoubleComplex((double)realsum,(double)imagsum);
}
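The early-return guard above is equivalent to testing sqrt(ut*ut+vt*vt) > 100/beta, i.e. the rotated (u,v) radius lies far outside the shapelet's scale, where evaluating the basis would otherwise produce spurious NaNs; the isnan() safeguards then catch anything that still slips through. In plain C the test reads (sketch):

#include <math.h>
/* 1 if (ut,vt) is so far outside the shapelet scale beta that
   the contribution is effectively zero */
static int outside_shapelet_support(float ut, float vt, float beta) {
  float r = sqrtf(ut*ut + vt*vt);
  return r > 0.0f && 100.0f/r < beta;   /* i.e. r > 100/beta */
}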
__device__ cuDoubleComplex
compute_prodterm(int sta1, int sta2, int N, int K, int T, int F,
double phterm0, double sIf, double sI0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w) {
__device__ void
compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F,
double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
/* F>1 is assumed; output: 8x1 array */
double sinph,cosph;
double myfreq=__ldg(&freqs[cf]);
sincos(phterm0*myfreq,&sinph,&cosph);
cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
double If;
if (F==1) {
/* flux: do not use spectra here, because F=1*/
If=sIf;
} else {
/* evaluate spectra */
double fratio=log(myfreq/myf0);
double fratio1=fratio*fratio;
double fratio2=fratio1*fratio;
/* catch -ve flux */
if (sI0f>0.0) {
If=exp(log(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
} else if (sI0f<0.0) {
If=-exp(log(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
} else {
If=0.0;
}
double If,Qf,Uf,Vf;
If=Qf=Uf=Vf=0.0;
/* evaluate spectra */
double fratio=log(myfreq/myf0);
double fratio1=fratio*fratio;
double fratio2=fratio1*fratio;
double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2;
/* catch -ve flux */
if (sI0f>0.0) {
If=exp(log(sI0f)+cm);
} else if (sI0f<0.0) {
If=-exp(log(-sI0f)+cm);
}
if (sQ0f>0.0) {
Qf=exp(log(sQ0f)+cm);
} else if (sQ0f<0.0) {
Qf=-exp(log(-sQ0f)+cm);
}
if (sU0f>0.0) {
Uf=exp(log(sU0f)+cm);
} else if (sU0f<0.0) {
Uf=-exp(log(-sU0f)+cm);
}
if (sV0f>0.0) {
Vf=exp(log(sV0f)+cm);
} else if (sV0f<0.0) {
Vf=-exp(log(-sV0f)+cm);
}
/* smearing */
/* smearing, beam */
double scalef=1.0;
double phterm =(phterm0*0.5*deltaf);
if (phterm!=0.0) {
sinph=(sin(phterm)/phterm);
If *=fabsf(sinph); /* catch -ve values due to rounding off */
scalef *=fabs(sinph); /* catch -ve values due to rounding off */
}
if (dobeam) {
@ -396,13 +432,13 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
//int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm;
int boffset2=itm*N*K*F+k*N*F+cf*N+sta2;
float beam2=__ldg(&beam[boffset2]);
If *=(double)(beam1*beam2);
scalef *=(double)(beam1*beam2);
}
/* form complex value */
prodterm.x *=If;
prodterm.y *=If;
prodterm.x *=scalef;
prodterm.y *=scalef;
/* check for type of source */
if (stypeT!=STYPE_POINT) {
@ -420,10 +456,99 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
}
}
//printf("Input k=%d uvw %lE,%lE,%lE sI=%f phterm0=%lf\n",k,u,v,w,sIf,phterm0);
//printf("phterm %lf smearing %f\n",phterm,sinph);
//printf("k=%d uvw %lE,%lE,%lE If=%f phterm %lf out=%f,%f\n",k,u,v,w,If,phterm,prodterm.x,prodterm.y);
return prodterm;
double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
Ix=If*prodterm.x;
Iy=If*prodterm.y;
Qx=Qf*prodterm.x;
Qy=Qf*prodterm.y;
Ux=Uf*prodterm.x;
Uy=Uf*prodterm.y;
Vx=Vf*prodterm.x;
Vy=Vf*prodterm.y;
output[0]=Ix+Qx;
output[1]=Iy+Qy;
output[2]=Ux-Vy;
output[3]=Vx+Uy;
output[4]=Ux+Vy;
output[5]=-Vx+Uy;
output[6]=Ix-Qx;
output[7]=Iy-Qy;
}
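The 8-element output packs the linear correlations in the order [Re XX, Im XX, Re XY, Im XY, Re YX, Im YX, Re YY, Im YY], using the standard Stokes relations XX=I+Q, XY=U+jV, YX=U-jV, YY=I-Q, each multiplied by the common complex factor prodterm (fringe phase x smearing x beam). A host-side restatement (sketch, C99 complex):

#include <complex.h>
/* Stokes fluxes -> linear-feed correlations, scaled by 'prod' */
static void stokes_to_corr(double sI, double sQ, double sU, double sV,
                           double complex prod, double complex out[4]) {
  out[0] = (sI + sQ) * prod;               /* XX */
  out[1] = (sU + sV*_Complex_I) * prod;    /* XY */
  out[2] = (sU - sV*_Complex_I) * prod;    /* YX */
  out[3] = (sI - sQ) * prod;               /* YY */
}

Expanding out[1] = (U+jV)(px+jpy) gives (U*px - V*py) + j(V*px + U*py), which is exactly the output[2]/output[3] pair written by the kernel above.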
__device__ void
compute_prodterm(int sta1, int sta2, int N, int K, int T,
double phterm0, double If, double Qf, double Uf, double Vf,
double myfreq, double deltaf, int dobeam, int itm, int k, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
/* F==1 is assumed, output: 8x1 array */
double sinph,cosph;
sincos(phterm0*myfreq,&sinph,&cosph);
cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
/* smearing, beam */
double scalef=1.0;
double phterm =(phterm0*0.5*deltaf);
if (phterm!=0.0) {
sinph=(sin(phterm)/phterm);
scalef *=fabs(sinph); /* catch -ve values due to rounding off */
}
if (dobeam) {
/* get beam info */
int boffset1=itm*N*K+k*N+sta1;
// printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2);
float beam1=__ldg(&beam[boffset1]);
int boffset2=itm*N*K+k*N+sta2;
float beam2=__ldg(&beam[boffset2]);
scalef *=(double)(beam1*beam2);
}
/* form complex value */
prodterm.x *=scalef;
prodterm.y *=scalef;
/* check for type of source */
if (stypeT!=STYPE_POINT) {
double uscaled=u*myfreq;
double vscaled=v*myfreq;
double wscaled=w*myfreq;
if (stypeT==STYPE_SHAPELET) {
prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_GAUSSIAN) {
prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_DISK) {
prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_RING) {
prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
}
}
double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
Ix=If*prodterm.x;
Iy=If*prodterm.y;
Qx=Qf*prodterm.x;
Qy=Qf*prodterm.y;
Ux=Uf*prodterm.x;
Uy=Uf*prodterm.y;
Vx=Vf*prodterm.x;
Vy=Vf*prodterm.y;
output[0]=Ix+Qx;
output[1]=Iy+Qy;
output[2]=Ux-Vy;
output[3]=Vx+Uy;
output[4]=Ux+Vy;
output[5]=-Vx+Uy;
output[6]=Ix-Qx;
output[7]=Iy-Qy;
}
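Both prodterm variants apply the same bandwidth-decorrelation factor: scalef *= |sin(phi)/phi| with phi = phterm0*deltaf/2, which tends to 1 as phi -> 0, so wide fringes are left untouched. Isolated (sketch):

#include <math.h>
/* bandwidth-smearing attenuation used in both prodterm variants */
static double
smear_factor(double phterm0, double deltaf) {
  double ph=phterm0*0.5*deltaf;
  return (ph!=0.0 ? fabs(sin(ph)/ph) : 1.0);
}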
@ -431,8 +556,11 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
__global__ void
kernel_coherencies(int B, int N, int T, int K, int F,
const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn, const double *__restrict__ sI,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0, const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn,
const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0,
const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0,
const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
/* global thread index */
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
@ -451,46 +579,96 @@ kernel_coherencies(int B, int N, int T, int K, int F,
//TODO: figure out if this max_f makes any sense
#define MAX_F 20
cuDoubleComplex l_coh[MAX_F];
double l_coh[MAX_F][8];
for(int cf=0; cf<F; cf++) {
l_coh[cf] = make_cuDoubleComplex(0.0, 0.0);
l_coh[cf][0]=0.0;
l_coh[cf][1]=0.0;
l_coh[cf][2]=0.0;
l_coh[cf][3]=0.0;
l_coh[cf][4]=0.0;
l_coh[cf][5]=0.0;
l_coh[cf][6]=0.0;
l_coh[cf][7]=0.0;
}
//use simply for-loop, if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
// split to two cases, F==1 and F>1
if (F==1) {
//use a simple for-loop; if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sIf,sI0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
double sIf,sQf,sUf,sVf;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sIf=__ldg(&sI[k]);
if (F>1) {
sI0f=__ldg(&sI0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
}
sQf=__ldg(&sQ[k]);
sUf=__ldg(&sU[k]);
sVf=__ldg(&sV[k]);
unsigned char stypeT=__ldg(&stype[k]);
double llcoh[8];
compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf,
__ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh);
l_coh[0][0] +=llcoh[0];
l_coh[0][1] +=llcoh[1];
l_coh[0][2] +=llcoh[2];
l_coh[0][3] +=llcoh[3];
l_coh[0][4] +=llcoh[4];
l_coh[0][5] +=llcoh[5];
l_coh[0][6] +=llcoh[6];
l_coh[0][7] +=llcoh[7];
}
} else {
//use a simple for-loop; if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sI0f,sQ0f,sU0f,sV0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sI0f=__ldg(&sI0[k]);
sQ0f=__ldg(&sQ0[k]);
sU0f=__ldg(&sU0[k]);
sV0f=__ldg(&sV0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
unsigned char stypeT=__ldg(&stype[k]);
for(int cf=0; cf<F; cf++) {
l_coh[cf] = cuCadd(l_coh[cf], compute_prodterm(sta1, sta2, N, K, T, F, phterm0, sIf, sI0f, spec_idxf, spec_idx1f, spec_idx2f,
myf0, freqs, deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n));
double llcoh[8];
compute_prodterm_multifreq(sta1, sta2, N, K, T, F, phterm0, sI0f, sQ0f, sU0f, sV0f, spec_idxf, spec_idx1f, spec_idx2f,
myf0, __ldg(&(freqs[cf])), deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n,llcoh);
l_coh[cf][0] +=llcoh[0];
l_coh[cf][1] +=llcoh[1];
l_coh[cf][2] +=llcoh[2];
l_coh[cf][3] +=llcoh[3];
l_coh[cf][4] +=llcoh[4];
l_coh[cf][5] +=llcoh[5];
l_coh[cf][6] +=llcoh[6];
l_coh[cf][7] +=llcoh[7];
}
}
}
}
//write output with the right multi-frequency offset
double *coh1 = &coh[8*n];
for(int cf=0; cf<F; cf++) {
coh1[cf*8*B+0] = l_coh[cf].x;
coh1[cf*8*B+1] = l_coh[cf].y;
coh1[cf*8*B+2] = 0.0;
coh1[cf*8*B+3] = 0.0;
coh1[cf*8*B+4] = 0.0;
coh1[cf*8*B+5] = 0.0;
coh1[cf*8*B+6] = l_coh[cf].x;
coh1[cf*8*B+7] = l_coh[cf].y;
coh1[cf*8*B+0] = l_coh[cf][0];
coh1[cf*8*B+1] = l_coh[cf][1];
coh1[cf*8*B+2] = l_coh[cf][2];
coh1[cf*8*B+3] = l_coh[cf][3];
coh1[cf*8*B+4] = l_coh[cf][4];
coh1[cf*8*B+5] = l_coh[cf][5];
coh1[cf*8*B+6] = l_coh[cf][6];
coh1[cf*8*B+7] = l_coh[cf][7];
}
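On the device the coherencies are stored channel-major: element i of channel cf for baseline n sits at coh[cf*8*B + 8*n + i], so consecutive channels of one baseline are 8*B doubles apart; the host side later re-strides this into the 8M-per-baseline cluster layout (see the my_ccopy calls in the predict file below). Index helper (sketch):

#include <stddef.h>
/* device-side coherency layout: channel-major, 8 doubles per baseline */
static inline size_t
coh_index(int B, int n, int cf, int i) {
  return (size_t)cf*8*B + (size_t)8*n + (size_t)i;
}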
@ -498,6 +676,196 @@ kernel_coherencies(int B, int N, int T, int K, int F,
}
/* kernel to calculate residuals */
__global__ void
kernel_residuals(int B, int N, int T, int K, int F,
const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
const double *__restrict__ p, int nchunk,
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn,
const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0,
const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0,
const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
/* global thread index */
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
/* each thread will calculate for one baseline, over all sources */
if (n<B) {
int sta1=barr[n].sta1;
int sta2=barr[n].sta2;
/* find out which time slot this baseline is from */
int tslot=n/((N*(N-1)/2));
/* find out which chunk to select from p : 0,1..nchunk-1 */
int chunk=(n*nchunk)/B;
/* create G1 and G2 Jones matrices from p */
cuDoubleComplex G1[4],G2[4];
G1[0].x=__ldg(&p[chunk*8*N+sta1*8]);
G1[0].y=__ldg(&p[chunk*8*N+sta1*8+1]);
G1[1].x=__ldg(&p[chunk*8*N+sta1*8+2]);
G1[1].y=__ldg(&p[chunk*8*N+sta1*8+3]);
G1[2].x=__ldg(&p[chunk*8*N+sta1*8+4]);
G1[2].y=__ldg(&p[chunk*8*N+sta1*8+5]);
G1[3].x=__ldg(&p[chunk*8*N+sta1*8+6]);
G1[3].y=__ldg(&p[chunk*8*N+sta1*8+7]);
G2[0].x=__ldg(&p[chunk*8*N+sta2*8]);
G2[0].y=__ldg(&p[chunk*8*N+sta2*8+1]);
G2[1].x=__ldg(&p[chunk*8*N+sta2*8+2]);
G2[1].y=__ldg(&p[chunk*8*N+sta2*8+3]);
G2[2].x=__ldg(&p[chunk*8*N+sta2*8+4]);
G2[2].y=__ldg(&p[chunk*8*N+sta2*8+5]);
G2[3].x=__ldg(&p[chunk*8*N+sta2*8+6]);
G2[3].y=__ldg(&p[chunk*8*N+sta2*8+7]);
double u_n=(u[n]);
double v_n=(v[n]);
double w_n=(w[n]);
//TODO: figure out if this max_f makes any sense
#define MAX_F 20
double l_coh[MAX_F][8];
for(int cf=0; cf<F; cf++) {
l_coh[cf][0]=0.0;
l_coh[cf][1]=0.0;
l_coh[cf][2]=0.0;
l_coh[cf][3]=0.0;
l_coh[cf][4]=0.0;
l_coh[cf][5]=0.0;
l_coh[cf][6]=0.0;
l_coh[cf][7]=0.0;
}
// split to two cases, F==1 and F>1
if (F==1) {
//use a simple for-loop; if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sIf,sQf,sUf,sVf;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sIf=__ldg(&sI[k]);
sQf=__ldg(&sQ[k]);
sUf=__ldg(&sU[k]);
sVf=__ldg(&sV[k]);
unsigned char stypeT=__ldg(&stype[k]);
double llcoh[8];
compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf,
__ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh);
l_coh[0][0] +=llcoh[0];
l_coh[0][1] +=llcoh[1];
l_coh[0][2] +=llcoh[2];
l_coh[0][3] +=llcoh[3];
l_coh[0][4] +=llcoh[4];
l_coh[0][5] +=llcoh[5];
l_coh[0][6] +=llcoh[6];
l_coh[0][7] +=llcoh[7];
}
cuDoubleComplex L1[4],L2[4];
L1[0].x=l_coh[0][0];
L1[0].y=l_coh[0][1];
L1[1].x=l_coh[0][2];
L1[1].y=l_coh[0][3];
L1[2].x=l_coh[0][4];
L1[2].y=l_coh[0][5];
L1[3].x=l_coh[0][6];
L1[3].y=l_coh[0][7];
/* L2=G1*L1 */
amb(G1,L1,L2);
/* L1=L2*G2^H */
ambt(L2,G2,L1);
l_coh[0][0]=L1[0].x;
l_coh[0][1]=L1[0].y;
l_coh[0][2]=L1[1].x;
l_coh[0][3]=L1[1].y;
l_coh[0][4]=L1[2].x;
l_coh[0][5]=L1[2].y;
l_coh[0][6]=L1[3].x;
l_coh[0][7]=L1[3].y;
} else {
//use a simple for-loop; if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sI0f,sQ0f,sU0f,sV0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sI0f=__ldg(&sI0[k]);
sQ0f=__ldg(&sQ0[k]);
sU0f=__ldg(&sU0[k]);
sV0f=__ldg(&sV0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
unsigned char stypeT=__ldg(&stype[k]);
for(int cf=0; cf<F; cf++) {
double llcoh[8];
compute_prodterm_multifreq(sta1, sta2, N, K, T, F, phterm0, sI0f, sQ0f, sU0f, sV0f, spec_idxf, spec_idx1f, spec_idx2f,
myf0, __ldg(&(freqs[cf])), deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n,llcoh);
l_coh[cf][0] +=llcoh[0];
l_coh[cf][1] +=llcoh[1];
l_coh[cf][2] +=llcoh[2];
l_coh[cf][3] +=llcoh[3];
l_coh[cf][4] +=llcoh[4];
l_coh[cf][5] +=llcoh[5];
l_coh[cf][6] +=llcoh[6];
l_coh[cf][7] +=llcoh[7];
}
}
cuDoubleComplex L1[4],L2[4];
for(int cf=0; cf<F; cf++) {
L1[0].x=l_coh[cf][0];
L1[0].y=l_coh[cf][1];
L1[1].x=l_coh[cf][2];
L1[1].y=l_coh[cf][3];
L1[2].x=l_coh[cf][4];
L1[2].y=l_coh[cf][5];
L1[3].x=l_coh[cf][6];
L1[3].y=l_coh[cf][7];
/* L2=G1*L1 */
amb(G1,L1,L2);
/* L1=L2*G2^H */
ambt(L2,G2,L1);
l_coh[cf][0]=L1[0].x;
l_coh[cf][1]=L1[0].y;
l_coh[cf][2]=L1[1].x;
l_coh[cf][3]=L1[1].y;
l_coh[cf][4]=L1[2].x;
l_coh[cf][5]=L1[2].y;
l_coh[cf][6]=L1[3].x;
l_coh[cf][7]=L1[3].y;
}
}
//write output with the right multi-frequency offset
double *coh1 = &coh[8*n];
for(int cf=0; cf<F; cf++) {
coh1[cf*8*B+0] = l_coh[cf][0];
coh1[cf*8*B+1] = l_coh[cf][1];
coh1[cf*8*B+2] = l_coh[cf][2];
coh1[cf*8*B+3] = l_coh[cf][3];
coh1[cf*8*B+4] = l_coh[cf][4];
coh1[cf*8*B+5] = l_coh[cf][5];
coh1[cf*8*B+6] = l_coh[cf][6];
coh1[cf*8*B+7] = l_coh[cf][7];
}
}
}
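kernel_residuals is kernel_coherencies plus the station gains: each baseline picks its solution chunk as chunk = (n*nchunk)/B, so runs of roughly B/nchunk consecutive baselines share one 8N-parameter block, and the accumulated coherency L is wrapped as G1*L*G2^H with amb()/ambt() before the write-out. The chunk mapping in isolation (sketch):

/* map baseline n in [0,B) to a solution chunk in [0,nchunk) */
static inline int
chunk_of_baseline(int n, int B, int nchunk) {
  return (int)(((long)n*nchunk)/B); /* widened here to avoid overflow for large n*nchunk */
}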
/* kernel to convert time (JD) to GMST angle*/
__global__ void
@ -585,7 +953,7 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
/*
calculate coherencies:
B: total baselines
B: total baselines (could be more than one timeslot)
N: no of stations
T: no of time slots
K: no of sources
@ -608,8 +976,8 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
dobeam: enable beam if >0
*/
void
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI,
unsigned char *stype, double *sI0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
cudaError_t error;
error = cudaGetLastError();
@ -620,8 +988,8 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v,
/* note: make sure we do not exceed max no of blocks available,
otherwise (too many baselines, loop over source id) */
int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI,
stype, sI0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
@ -634,6 +1002,33 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v,
}
/* p : parameters, 8N x nchunk values */
void
cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
cudaError_t error;
error = cudaGetLastError();
#endif
/* spawn threads to handle baselines, these threads will spawn threads for sources */
int ThreadsPerBlock=DEFAULT_TH_PER_BK;
/* note: make sure we do not exceed max no of blocks available,
otherwise (too many baselines, loop over source id) */
int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
kernel_residuals<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
if(error != cudaSuccess) {
// print the CUDA error message and exit
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
exit(-1);
}
#endif
}
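Both wrappers use the usual ceil-division launch so B baselines are covered by whole blocks; the n<B guard in the kernels discards the overhang threads, and cudaDeviceSynchronize() makes the following device-to-host copies safe. The launch arithmetic (sketch; DEFAULT_TH_PER_BK is a codebase constant):

/* grid rounded up so B baselines are covered by whole blocks;
   threads with n >= B exit early inside the kernel */
static inline int ceil_div(int a, int b) { return (a + b - 1)/b; }
/* BlocksPerGrid = ceil_div(B, ThreadsPerBlock) */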
/* convert time JD to GMST angle
store result at the same location */
void

View File

@ -92,7 +92,7 @@ dtofcopy(int N, float **x_d, double *x) {
checkCudaError(err,__FILE__,__LINE__);
/* copy memory */
err=cudaMemcpyAsync(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice,0);
err=cudaMemcpy(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* free host buffer */
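The cudaMemcpyAsync -> cudaMemcpy change in dtofcopy() makes the copy explicitly synchronous: the float staging buffer xhost is freed immediately afterwards, and while an async copy from pageable memory is documented to return only after the source has been staged, relying on that is subtle and would break if the buffer were ever allocated pinned. The explicit alternative (sketch, using dtofcopy's own names):

/* alternative: keep the staging buffer alive until the async copy has drained */
err=cudaMemcpyAsync(xc, xhost, N*sizeof(float), cudaMemcpyHostToDevice, 0);
checkCudaError(err,__FILE__,__LINE__);
err=cudaStreamSynchronize(0);   /* copy complete; xhost may now be freed */
checkCudaError(err,__FILE__,__LINE__);
free(xhost);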
@ -120,6 +120,15 @@ precalcoh_threadfn(void *data) {
err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);
/* make sure enough heap memory is available for shapelet computations */
size_t plim;
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
checkCudaError(err,__FILE__,__LINE__);
if (plim<GPU_HEAP_SIZE*1024*1024) {
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
checkCudaError(err,__FILE__,__LINE__);
}
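This heap check is needed because shapelet_contrib__ calls malloc()/free() in device code (the Av and cplx buffers in predict_model.cu above); the CUDA in-kernel heap defaults to only 8 MB, so clusters with many shapelet modes would otherwise see malloc return NULL inside the kernel. GPU_HEAP_SIZE (in MB) is defined elsewhere in the codebase; the same guard recurs in predictvis_threadfn and residual_threadfn below. As a standalone helper (sketch):

#include <cuda_runtime.h>
/* grow the in-kernel malloc heap (default is only 8 MB) before any kernel runs */
static void
ensure_gpu_heap(size_t mb) {
  size_t cur=0, want=mb*1024*1024;
  cudaDeviceGetLimit(&cur, cudaLimitMallocHeapSize);
  if (cur<want) {
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, want);
  }
}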
double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
@ -129,7 +138,7 @@ precalcoh_threadfn(void *data) {
float **xx_p=0,**yy_p=0,**zz_p=0;
float **xxd,**yyd,**zzd;
/* allocate memory in GPU */
err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(float)); /* coherencies only for 1 cluster, Nf=1 */
err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(double)); /* coherencies only for 1 cluster, Nf=1 */
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
checkCudaError(err,__FILE__,__LINE__);
@ -218,9 +227,16 @@ precalcoh_threadfn(void *data) {
float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;
complex double *tempdcoh;
err=cudaMallocHost((void**)&tempdcoh,sizeof(complex double)*(size_t)t->Nbase*4);
checkCudaError(err,__FILE__,__LINE__);
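Allocating tempdcoh once with cudaMallocHost (pinned memory), outside the cluster loop, replaces the per-cluster calloc/free pair that this hunk removes further down: pinned pages let cudaMemcpy DMA directly, the buffer is reused for every cluster, and it is released once with cudaFreeHost after the loop. The shape of the change (sketch):

#include <complex.h>
#include <cuda_runtime.h>
/* pattern: pinned staging buffer allocated once, reused for every cluster */
static void
per_cluster_copy_pattern(int Nbase, int M) {
  double complex *tmp=NULL;
  cudaMallocHost((void**)&tmp, sizeof(double complex)*(size_t)Nbase*4); /* pinned */
  for (int ncl=0; ncl<M; ncl++) {
    /* launch kernel; cudaMemcpy device coherencies into tmp; re-stride into host array */
  }
  cudaFreeHost(tmp);
}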
/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
@ -249,11 +265,29 @@ precalcoh_threadfn(void *data) {
err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
@ -264,7 +298,8 @@ precalcoh_threadfn(void *data) {
err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* for multi channel data */
/* for multi channel data - FIXME: remove this part because Nf==1 always */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
@ -286,6 +321,22 @@ precalcoh_threadfn(void *data) {
err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@ -347,15 +398,10 @@ precalcoh_threadfn(void *data) {
/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
/* copy back coherencies to host,
coherencies on host have 8M stride, on device have 8 stride */
complex double *tempdcoh;
if ((tempdcoh=(complex double*)calloc((size_t)t->Nbase*4,sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
err=cudaMemcpy((double*)tempdcoh, cohd, sizeof(double)*t->Nbase*8, cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
/* now copy this with right offset and stride */
@ -363,7 +409,6 @@ precalcoh_threadfn(void *data) {
my_ccopy(t->Nbase,&tempdcoh[1],4,&(t->coh[4*ncl+1]),4*t->M);
my_ccopy(t->Nbase,&tempdcoh[2],4,&(t->coh[4*ncl+2]),4*t->M);
my_ccopy(t->Nbase,&tempdcoh[3],4,&(t->coh[4*ncl+3]),4*t->M);
free(tempdcoh);
for (cj=0; cj<t->carr[ncl].N; cj++) {
@ -402,8 +447,16 @@ precalcoh_threadfn(void *data) {
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
@ -413,6 +466,7 @@ precalcoh_threadfn(void *data) {
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
@ -423,10 +477,21 @@ precalcoh_threadfn(void *data) {
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
/* free memory */
err=cudaFreeHost(tempdcoh);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
@ -545,7 +610,7 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
threaddata[nth].x=0; /* no input data */
threaddata[nth].N=N;
threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbasextilesz */
threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbase x tilesz (from input) */
threaddata[nth].barr=barr;
threaddata[nth].carr=carr;
threaddata[nth].M=M;
@ -666,7 +731,6 @@ predictvis_threadfn(void *data) {
cudaError_t err;
int ci,ncl,cj;
printf("Attath %d to card %d\n",t->tid,card);
err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);
@ -675,12 +739,13 @@ printf("Attath %d to card %d\n",t->tid,card);
size_t plim;
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
checkCudaError(err,__FILE__,__LINE__);
if (plim<16*1024*1024) { /* 16MB */
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, 16*1024*1024);
if (plim<GPU_HEAP_SIZE*1024*1024) {
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
checkCudaError(err,__FILE__,__LINE__);
}
double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
@ -782,8 +847,10 @@ printf("Attath %d to card %d\n",t->tid,card);
float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;
double *xlocal;
@ -792,7 +859,6 @@ printf("Attath %d to card %d\n",t->tid,card);
/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
printf("th %d clus %d:%d working %d\n",t->tid,t->soff,t->soff+t->Ns-1,ncl);
/* allocate memory for this clusters beam */
if (t->dobeam) {
err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
@ -820,11 +886,27 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
@ -836,6 +918,7 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
/* for multi channel data */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
@ -858,6 +941,23 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@ -920,7 +1020,7 @@ printf("Attath %d to card %d\n",t->tid,card);
/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
/* copy back coherencies to host,
coherencies on host have 8M stride, on device have 8 stride */
@ -964,8 +1064,16 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
@ -975,6 +1083,7 @@ printf("Attath %d to card %d\n",t->tid,card);
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
@ -985,6 +1094,14 @@ printf("Attath %d to card %d\n",t->tid,card);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
@ -1037,6 +1154,7 @@ printf("Attath %d to card %d\n",t->tid,card);
free(zz_p);
}
/* reset error state */
err=cudaGetLastError();
return NULL;
@ -1089,7 +1207,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
/* set common parameters, and split baselines to threads */
/* set common parameters, and split clusters to threads */
ci=0;
for (nth=0; nth<Ngpu && ci<M; nth++) {
if (ci+Nthb0<M) {
@ -1158,6 +1276,583 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
free(xlocal);
free(threaddata);
destroy_task_hist(&thst);
free(th_array);
pthread_attr_destroy(&attr);
return 0;
}
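residual_threadfn below mirrors predictvis_threadfn (one pthread per GPU card, clusters split across threads) with two additions: it uploads each cluster's solution block, nchunk blocks of 8N doubles starting at t->p[t->carr[ncl].p[0]], and launches cudakernel_residuals so the predicted model is corrupted by the current Jones solutions before being accumulated into the residual. The parameter upload in isolation (sketch; helper name hypothetical):

#include <cuda_runtime.h>
/* upload one cluster's solutions: nchunk blocks of 8N doubles
   starting at offset poff in the global parameter vector */
static double *
upload_cluster_solutions(const double *p, int poff, int N, int nchunk) {
  double *pd=NULL;
  size_t sz=(size_t)N*8*nchunk*sizeof(double);
  cudaMalloc((void**)&pd, sz);
  cudaMemcpy(pd, &p[poff], sz, cudaMemcpyHostToDevice);
  return pd;
}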
static void *
residual_threadfn(void *data) {
thread_data_pred_t *t=(thread_data_pred_t*)data;
/* first, select a GPU, if total clusters < MAX_GPU_ID
use random selection, else use this thread id */
int card;
if (t->M<=MAX_GPU_ID) {
card=select_work_gpu(MAX_GPU_ID,t->hst);
} else {
card=t->tid;
}
cudaError_t err;
int ci,ncl,cj;
err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);
/* make sure enough heap memory is available for shapelet computations */
size_t plim;
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
checkCudaError(err,__FILE__,__LINE__);
if (plim<GPU_HEAP_SIZE*1024*1024) {
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
checkCudaError(err,__FILE__,__LINE__);
}
double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
double *freqsd;
float *longd=0,*latd=0; double *timed;
int *Nelemd;
float **xx_p=0,**yy_p=0,**zz_p=0;
float **xxd,**yyd,**zzd;
/* allocate memory in GPU */
err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters */
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
checkCudaError(err,__FILE__,__LINE__);
/* copy to device */
/* u,v,w and l,m,n coords need to be double for precision */
err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* check if beam is actually calculated */
if (t->dobeam) {
dtofcopy(t->N,&longd,t->longitude);
dtofcopy(t->N,&latd,t->latitude);
err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* convert time jd to GMST angle */
cudakernel_convert_time(t->tilesz,timed);
err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* jagged arrays for element locations */
err=cudaMalloc((void**)&xxd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yyd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zzd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
/* allocate host memory to store pointers */
if ((xx_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((yy_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((zz_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
for (ci=0; ci<t->N; ci++) {
err=cudaMalloc((void**)&xx_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yy_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zz_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
}
/* now copy data */
for (ci=0; ci<t->N; ci++) {
dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
}
/* now copy pointer locations to device */
err=cudaMemcpy(xxd, xx_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(yyd, yy_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;
double *xlocal;
err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
checkCudaError(err,__FILE__,__LINE__);
double *pd; /* parameter array per cluster */
/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
/* allocate memory for this clusters beam */
if (t->dobeam) {
err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
checkCudaError(err,__FILE__,__LINE__);
} else {
beamd=0;
}
/* copy cluster details to GPU */
err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* parameter vector size may change depending on hybrid parameter */
err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
} else {
rad=0;
decd=0;
}
err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* for multi channel data */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
host_p[cj]=0;
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
checkCudaError(err,__FILE__,__LINE__);
double *modes;
err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
}
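 /* The shapelet branch above is a manual deep copy of a struct that itself
    contains a pointer: the struct is copied to the device first, then its
    host 'modes' pointer is overwritten in device memory with the device copy
    of the modes array. Generic sketch of the pattern (illustrative only):

      exinfo_shapelet *d_s; double *d_m;
      cudaMalloc((void**)&d_s, sizeof(exinfo_shapelet));
      cudaMemcpy(d_s, h_s, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
      cudaMalloc((void**)&d_m, h_s->n0*h_s->n0*sizeof(double));
      cudaMemcpy(d_m, h_s->modes, h_s->n0*h_s->n0*sizeof(double), cudaMemcpyHostToDevice);
      cudaMemcpy(&(d_s->modes), &d_m, sizeof(double*), cudaMemcpyHostToDevice);
 */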
/* now copy pointer locations to device */
err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
/* now calculate beam for all sources in this cluster */
cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
}
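 /* beamd now holds the precomputed array-beam gains (indexed by station,
    time slot, source and channel, as the kernel arguments suggest); the
    residual kernel below consumes it when t->dobeam is set */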
/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
 /* copy back this cluster's model visibilities to host;
    xlocal and cohd are both Nbase*8*Nf doubles, copied contiguously */
err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
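 /* my_daxpy has BLAS daxpy semantics, y := a*x + y, so this accumulates the
    cluster's model visibilities into t->x */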
my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
double *modes=0;
 err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(modes);
 checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
}
}
free(host_p);
err=cudaFree(dev_p);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
err=cudaFree(beamd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(lld);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(mmd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(pd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(decd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx1d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/
/* free memory */
err=cudaFreeHost(xlocal);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(wd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(cohd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(barrd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(freqsd);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
err=cudaFree(longd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(latd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(timed);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(Nelemd);
checkCudaError(err,__FILE__,__LINE__);
for (ci=0; ci<t->N; ci++) {
err=cudaFree(xx_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yy_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zz_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(xxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yyd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zzd);
checkCudaError(err,__FILE__,__LINE__);
free(xx_p);
free(yy_p);
free(zz_p);
}
/* reset error state */
err=cudaGetLastError();
return NULL;
}
/* p: 8NMx1 parameter array, but M is 'effective' clusters, need to use carr to find the right offset
ccid: which cluster to use as correction
 rho: MMSE robust parameter; the correction uses (J + rho I) inverted
phase_only: if >0, and if there is any correction done, use only phase of diagonal elements for correction
*/
int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only) {
int nth,ci;
int Nthb0,Nthb;
pthread_attr_t attr;
pthread_t *th_array;
thread_data_pred_t *threaddata;
taskhist thst;
init_task_hist(&thst);
 /* oversubscribe GPU */
int Ngpu=MAX_GPU_ID+1;
 /* number of clusters per thread: ceil(M/Ngpu) */
Nthb0=(M+Ngpu-1)/Ngpu;
 /* set up threads; note: Ngpu is the number of GPUs used */
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* arrays to store result */
double *xlocal;
if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* set common parameters, and split clusters to threads */
ci=0;
for (nth=0; nth<Ngpu && ci<M; nth++) {
if (ci+Nthb0<M) {
Nthb=Nthb0;
} else {
Nthb=M-ci;
}
threaddata[nth].hst=&thst; /* for load balancing */
threaddata[nth].tid=nth;
threaddata[nth].u=u;
threaddata[nth].v=v;
threaddata[nth].w=w;
threaddata[nth].coh=0;
threaddata[nth].x=&xlocal[nth*Nbase*8*tilesz*Nchan]; /* distinct arrays to get back the result */
threaddata[nth].N=N;
 threaddata[nth].Nbase=Nbase*tilesz; /* total baselines: actually Nbase x tilesz */
threaddata[nth].barr=barr;
threaddata[nth].carr=carr;
threaddata[nth].M=M;
threaddata[nth].Nf=Nchan;
threaddata[nth].freqs=freqs;
threaddata[nth].fdelta=fdelta;
threaddata[nth].tdelta=tdelta;
threaddata[nth].dec0=dec0;
threaddata[nth].dobeam=dobeam;
threaddata[nth].ph_ra0=ph_ra0;
threaddata[nth].ph_dec0=ph_dec0;
threaddata[nth].ph_freq0=ph_freq0;
threaddata[nth].longitude=longitude;
threaddata[nth].latitude=latitude;
threaddata[nth].time_utc=time_utc;
threaddata[nth].tilesz=tilesz;
threaddata[nth].Nelem=Nelem;
threaddata[nth].xx=xx;
threaddata[nth].yy=yy;
threaddata[nth].zz=zz;
/* parameters */
threaddata[nth].p=p;
threaddata[nth].Ns=Nthb;
threaddata[nth].soff=ci;
pthread_create(&th_array[nth],&attr,residual_threadfn,(void*)(&threaddata[nth]));
/* next source set */
ci=ci+Nthb;
}
/* now wait for threads to finish */
for(ci=0; ci<nth; ci++) {
pthread_join(th_array[ci],NULL);
 /* subtract model from data to form the residual: x := x - model */
my_daxpy(Nbase*8*tilesz*Nchan,threaddata[ci].x,-1.0,x);
}
free(xlocal);
free(threaddata);
 destroy_task_hist(&thst);
 pthread_attr_destroy(&attr);
 free(th_array);

 return 0;
}
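/* Usage sketch (illustrative; the surrounding driver supplies these arrays):
   x holds the observed visibilities on entry and the data-minus-model
   residual on exit, since each thread's model is subtracted via my_daxpy:

     calculate_residuals_multifreq_withbeam_gpu(u,v,w,p,x,N,Nbase,tilesz,
        barr,carr,M,freqs,Nchan,fdelta,tdelta,dec0,
        ph_ra0,ph_dec0,ph_freq0,longitude,latitude,time_utc,
        Nelem,xx,yy,zz,dobeam,Nt,ccid,rho,phase_only);
*/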

View File

@ -2658,11 +2658,16 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int tilesz, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt);
/* FIXME: add_to_data: no subtraction is done */
extern int
predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data);
extern int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only);
#endif /*!HAVE_CUDA */
@ -2675,6 +2680,12 @@ predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double
#ifndef ARRAY_MAX_ELEM /* if using shared memory, max possible elements for a station */
#define ARRAY_MAX_ELEM 512
#endif
/* default GPU heap size (in MB) needed to calculate some shapelet models;
 if a model has n0>20 or so, try increasing this and recompiling
 (the CUDA default heap is ~ 8MB) */
#ifndef GPU_HEAP_SIZE
#define GPU_HEAP_SIZE 20
#endif
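/* Sketch of how a larger heap would be applied (assumption: the actual call
   site is elsewhere in the source tree); this must run before any kernel
   that uses device-side malloc():

     cudaDeviceSetLimit(cudaLimitMallocHeapSize,
                        (size_t)GPU_HEAP_SIZE*1024*1024);
*/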
extern void
@ -2683,9 +2694,12 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
extern void
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
 unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam);
extern void
cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam);
extern void
cudakernel_convert_time(int T, double *time_utc);