added model prediction and residual calculation using GPU
Parent: e07a26b63f
Commit: 028ac53347
@@ -239,6 +239,7 @@ ParseCmdLine(int ac, char **av) {
  }
}


/* real main program */
int
main(int argc, char **argv) {
@@ -537,12 +537,27 @@ cout<<myrank<<" : "<<cm<<": downweight ratio ("<<iodata_vec[cm].fratio<<") based
 }

 for(int cm=0; cm<mymscount; cm++) {
#ifndef HAVE_CUDA
  if (!doBeam) {
   precalculate_coherencies(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
  } else {
   precalculate_coherencies_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
    beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt);
  }
#endif
#ifdef HAVE_CUDA
  if (GPUpredict) {
   precalculate_coherencies_withbeam_gpu(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
    beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,doBeam,Data::Nt);
  } else {
   if (!doBeam) {
    precalculate_coherencies(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
   } else {
    precalculate_coherencies_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,coh_vec[cm],iodata_vec[cm].N,iodata_vec[cm].Nbase*iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freq0,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::min_uvcut,Data::max_uvcut,
     beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,iodata_vec[cm].tilesz,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt);
   }
  }
#endif
 }
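The block above combines compile-time selection (HAVE_CUDA) with run-time selection (GPUpredict) and a beam flag (doBeam). A minimal, self-contained sketch of the same dispatch shape; predict_cpu/predict_gpu are hypothetical stand-ins for the precalculate_coherencies* variants, not functions from the commit:

    #include <stdio.h>

    /* Hypothetical stand-ins for the precalculate_coherencies* variants. */
    static void predict_cpu(int doBeam) { printf("CPU predict, beam=%d\n", doBeam); }
    #ifdef HAVE_CUDA
    static void predict_gpu(int doBeam) { printf("GPU predict, beam=%d\n", doBeam); }
    #endif

    /* Same dispatch shape as the loop above: HAVE_CUDA picks the compiled path,
       GPUpredict switches at run time, doBeam is threaded through either way. */
    static void predict(int GPUpredict, int doBeam) {
    #ifdef HAVE_CUDA
      if (GPUpredict) { predict_gpu(doBeam); return; }
    #else
      (void)GPUpredict; /* CPU-only build ignores the run-time switch */
    #endif
      predict_cpu(doBeam);
    }

    int main(void) { predict(1, 1); return 0; }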
/******************** ADMM *******************************/
@@ -734,12 +749,27 @@ cout<<myrank<<" : "<<cm<<": downweight ratio ("<<iodata_vec[cm].fratio<<") based

 /* write residuals to output */
 for(int cm=0; cm<mymscount; cm++) {
#ifndef HAVE_CUDA
  if (!doBeam) {
   calculate_residuals_multifreq(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  } else {
   calculate_residuals_multifreq_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
    beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  }
#endif
#ifdef HAVE_CUDA
  if (GPUpredict) {
   calculate_residuals_multifreq_withbeam_gpu(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
    beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,doBeam,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  } else {
   if (!doBeam) {
    calculate_residuals_multifreq(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
   } else {
    calculate_residuals_multifreq_withbeam(iodata_vec[cm].u,iodata_vec[cm].v,iodata_vec[cm].w,p_vec[cm],iodata_vec[cm].xo,iodata_vec[cm].N,iodata_vec[cm].Nbase,iodata_vec[cm].tilesz,barr_vec[cm],carr_vec[cm],M,iodata_vec[cm].freqs,iodata_vec[cm].Nchan,iodata_vec[cm].deltaf,iodata_vec[cm].deltat,iodata_vec[cm].dec0,
     beam_vec[cm].p_ra0,beam_vec[cm].p_dec0,iodata_vec[cm].freq0,beam_vec[cm].sx,beam_vec[cm].sy,beam_vec[cm].time_utc,beam_vec[cm].Nelem,beam_vec[cm].xx,beam_vec[cm].yy,beam_vec[cm].zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
   }
  }
#endif
 }
 tilex+=iodata_vec[0].tilesz;
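For reference, the quantity these calls write out is the standard interferometric residual: per baseline $(p,q)$ and channel, the observed visibility minus the model corrupted by the current solutions. In the notation of the GPU kernels added further below (Jones matrices $G$ taken from the solution vector p, per-source coherencies $C$):

$$R_{pq} = V_{pq} - G_p \Big( \sum_{k} C^{(k)}_{pq} \Big) G_q^{H}$$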
@@ -526,18 +526,27 @@ main(int argc, char **argv) {
 /* FIXME: uvmin is not needed in calibration, because it's taken care of by flags */
 if (!Data::DoSim) {
 /****************** calibration **************************/
#ifndef HAVE_CUDA
 if (!doBeam) {
  precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
 } else {
  precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
   beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
 }
#endif
#ifdef HAVE_CUDA
 if (GPUpredict) {
  precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
   beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt);
 } else {
  if (!doBeam) {
   precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt);
  } else {
   precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,
    beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt);
  }
 }
#endif


#ifndef HAVE_CUDA
@@ -651,12 +660,28 @@ main(int argc, char **argv) {
 } else {
 /* since the residual is calculated using xo (not x), there is no need to
    unwhiten the data in case x was whitened */
#ifndef HAVE_CUDA
 if (!doBeam) {
  calculate_residuals_multifreq(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
 } else {
  calculate_residuals_multifreq_withbeam(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
   beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
 }
#endif
#ifdef HAVE_CUDA
 if (GPUpredict) {
  calculate_residuals_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
   beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
 } else {
  if (!doBeam) {
   calculate_residuals_multifreq(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  } else {
   calculate_residuals_multifreq_withbeam(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
    beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt,Data::ccid,Data::rho,Data::phaseOnly);
  }
 }
#endif

 }
 /****************** end calibration **************************/
 /****************** begin diagnostics ************************/
@@ -1224,6 +1224,7 @@ attach_gpu_to_thread1(int card, cublasHandle_t *cbhandle, cusolverDnHandle_t *s
 status=cusolverDnCreate(solver_handle);
 if (status != CUSOLVER_STATUS_SUCCESS) {
  fprintf(stderr,"%s: %d: CUSOLV create fail %d\n",__FILE__,__LINE__,status);
  fprintf(stderr,"common problems: not initialized %d, alloc fail %d, no compute %d\n",CUSOLVER_STATUS_NOT_INITIALIZED, CUSOLVER_STATUS_ALLOC_FAILED, CUSOLVER_STATUS_ARCH_MISMATCH);
  exit(1);
 }
}
@@ -26,6 +26,25 @@
/* enable this for checking for kernel failure */
//#define CUDA_DBG

/* matrix multiplications */
/* C=A*B */
__device__ void
amb(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
 c[0]=cuCadd(cuCmul(a[0],b[0]),cuCmul(a[1],b[2]));
 c[1]=cuCadd(cuCmul(a[0],b[1]),cuCmul(a[1],b[3]));
 c[2]=cuCadd(cuCmul(a[2],b[0]),cuCmul(a[3],b[2]));
 c[3]=cuCadd(cuCmul(a[2],b[1]),cuCmul(a[3],b[3]));
}
/* C=A*B^H */
__device__ void
ambt(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
 c[0]=cuCadd(cuCmul(a[0],cuConj(b[0])),cuCmul(a[1],cuConj(b[1])));
 c[1]=cuCadd(cuCmul(a[0],cuConj(b[2])),cuCmul(a[1],cuConj(b[3])));
 c[2]=cuCadd(cuCmul(a[2],cuConj(b[0])),cuCmul(a[3],cuConj(b[1])));
 c[3]=cuCadd(cuCmul(a[2],cuConj(b[2])),cuCmul(a[3],cuConj(b[3])));
}
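amb() and ambt() treat each cuDoubleComplex[4] as a row-major 2x2 matrix: a[0]=A00, a[1]=A01, a[2]=A10, a[3]=A11. A host-side C analogue for sanity-checking that index convention; this is a hypothetical helper for illustration, not part of the commit:

    #include <complex.h>
    #include <stdio.h>

    /* Host analogue of amb(): C = A*B with row-major 2x2 layout
       a[0]=A00, a[1]=A01, a[2]=A10, a[3]=A11. */
    static void amb_host(const double complex *a, const double complex *b, double complex *c) {
      c[0] = a[0]*b[0] + a[1]*b[2];
      c[1] = a[0]*b[1] + a[1]*b[3];
      c[2] = a[2]*b[0] + a[3]*b[2];
      c[3] = a[2]*b[1] + a[3]*b[3];
    }

    int main(void) {
      /* identity * B should reproduce B */
      double complex I2[4] = {1, 0, 0, 1};
      double complex B[4]  = {1+2*I, 3, 4*I, 5-1*I};
      double complex C[4];
      amb_host(I2, B, C);
      for (int i = 0; i < 4; i++)
        printf("C[%d]=%g%+gi\n", i, creal(C[i]), cimag(C[i]));
      return 0;
    }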

__device__ void
radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float thetaGMST, float *az, float *el) {
 float thetaLST=thetaGMST+longitude*180.0f/M_PI;
@@ -327,6 +346,11 @@ shapelet_contrib__(int *dd, float u, float v, float w) {
 __sincosf((float)dp->eP,&sinph,&cosph);
 ut=a*(cosph*up-sinph*vp);
 vt=b*(sinph*up+cosph*vp);
 /* if u,v is way off the scale (beta) of the shapelet modes, the result is almost always zero,
    so check this here and return 0, otherwise spurious NaNs may result */
 if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))<dp->beta) {
  return make_cuDoubleComplex(0.0,0.0);
 }
 /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v)
    so negate the u grid */
 Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float));
@@ -345,45 +369,57 @@ shapelet_contrib__(int *dd, float u, float v, float w) {

 free(Av);
 free(cplx);
 //return 2.0*M_PI*(realsum+_Complex_I*imagsum);
 realsum*=2.0f*M_PI*a*b;
 imagsum*=2.0f*M_PI*a*b;
 /* additional safeguards */
 if ( isnan(realsum) ) { realsum=0.0f; }
 if ( isnan(imagsum) ) { imagsum=0.0f; }
 return make_cuDoubleComplex((double)realsum,(double)imagsum);
}


__device__ void
compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F,
 double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
 double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
 /* F>1 is assumed, output: 8x1 array */
 double sinph,cosph;
 sincos(phterm0*myfreq,&sinph,&cosph);
 cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
 double If,Qf,Uf,Vf;
 If=Qf=Uf=Vf=0.0;
 /* evaluate spectra */
 double fratio=log(myfreq/myf0);
 double fratio1=fratio*fratio;
 double fratio2=fratio1*fratio;
 double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2;
 /* catch -ve flux */
 if (sI0f>0.0) {
  If=exp(log(sI0f)+cm);
 } else if (sI0f<0.0) {
  If=-exp(log(-sI0f)+cm);
 }
 if (sQ0f>0.0) {
  Qf=exp(log(sQ0f)+cm);
 } else if (sQ0f<0.0) { /* fixed: tested sI0f in the original */
  Qf=-exp(log(-sQ0f)+cm);
 }
 if (sU0f>0.0) {
  Uf=exp(log(sU0f)+cm);
 } else if (sU0f<0.0) { /* fixed: tested sI0f in the original */
  Uf=-exp(log(-sU0f)+cm);
 }
 if (sV0f>0.0) {
  Vf=exp(log(sV0f)+cm);
 } else if (sV0f<0.0) { /* fixed: tested sI0f in the original */
  Vf=-exp(log(-sV0f)+cm);
 }

 /* smearing, beam */
 double scalef=1.0;
 double phterm =(phterm0*0.5*deltaf);
 if (phterm!=0.0) {
  sinph=(sin(phterm)/phterm);
  scalef *=fabs(sinph); /* catch -ve values due to rounding off */
 }

 if (dobeam) {
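The spectral evaluation above is the usual log-polynomial model, with the sign of the reference flux preserved by the paired exp(log(±S₀)+cm) branches. For each Stokes parameter $S \in \{I, Q, U, V\}$:

$$S(f) = \operatorname{sgn}(S_0)\,\exp\!\Big(\ln|S_0| + \alpha\ln\tfrac{f}{f_0} + \beta\ln^2\tfrac{f}{f_0} + \gamma\ln^3\tfrac{f}{f_0}\Big)$$

where $\alpha,\beta,\gamma$ are spec_idxf, spec_idx1f and spec_idx2f, and cm in the code is the polynomial $\alpha\ln(f/f_0)+\beta\ln^2(f/f_0)+\gamma\ln^3(f/f_0)$.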
@@ -396,13 +432,13 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
  //int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm;
  int boffset2=itm*N*K*F+k*N*F+cf*N+sta2;
  float beam2=__ldg(&beam[boffset2]);
  scalef *=(double)(beam1*beam2);
 }


 /* form complex value */
 prodterm.x *=scalef;
 prodterm.y *=scalef;

 /* check for type of source */
 if (stypeT!=STYPE_POINT) {
@@ -420,10 +456,99 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
  }
 }

 double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
 Ix=If*prodterm.x;
 Iy=If*prodterm.y;
 Qx=Qf*prodterm.x;
 Qy=Qf*prodterm.y;
 Ux=Uf*prodterm.x;
 Uy=Uf*prodterm.y;
 Vx=Vf*prodterm.x;
 Vy=Vf*prodterm.y;

 output[0]=Ix+Qx;
 output[1]=Iy+Qy;
 output[2]=Ux-Vy;
 output[3]=Vx+Uy;
 output[4]=Ux+Vy;
 output[5]=-Vx+Uy;
 output[6]=Ix-Qx;
 output[7]=Iy-Qy;

}


__device__ void
compute_prodterm(int sta1, int sta2, int N, int K, int T,
 double phterm0, double If, double Qf, double Uf, double Vf,
 double myfreq, double deltaf, int dobeam, int itm, int k, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
 /* F==1 is assumed, output: 8x1 array */
 double sinph,cosph;
 sincos(phterm0*myfreq,&sinph,&cosph);
 cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);

 /* smearing, beam */
 double scalef=1.0;
 double phterm =(phterm0*0.5*deltaf);
 if (phterm!=0.0) {
  sinph=(sin(phterm)/phterm);
  scalef *=fabs(sinph); /* catch -ve values due to rounding off */
 }

 if (dobeam) {
  /* get beam info */
  int boffset1=itm*N*K+k*N+sta1;
  // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2);
  float beam1=__ldg(&beam[boffset1]);
  int boffset2=itm*N*K+k*N+sta2;
  float beam2=__ldg(&beam[boffset2]);
  scalef *=(double)(beam1*beam2);
 }


 /* form complex value */
 prodterm.x *=scalef;
 prodterm.y *=scalef;

 /* check for type of source */
 if (stypeT!=STYPE_POINT) {
  double uscaled=u*myfreq;
  double vscaled=v*myfreq;
  double wscaled=w*myfreq;
  if (stypeT==STYPE_SHAPELET) {
   prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_GAUSSIAN) {
   prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_DISK) {
   prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_RING) {
   prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  }
 }


 double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
 Ix=If*prodterm.x;
 Iy=If*prodterm.y;
 Qx=Qf*prodterm.x;
 Qy=Qf*prodterm.y;
 Ux=Uf*prodterm.x;
 Uy=Uf*prodterm.y;
 Vx=Vf*prodterm.x;
 Vy=Vf*prodterm.y;

 output[0]=Ix+Qx;
 output[1]=Iy+Qy;
 output[2]=Ux-Vy;
 output[3]=Vx+Uy;
 output[4]=Ux+Vy;
 output[5]=-Vx+Uy;
 output[6]=Ix-Qx;
 output[7]=Iy-Qy;

}
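Both routines return the same eight doubles: the real and imaginary parts of the 2x2 brightness matrix built from the Stokes fluxes, scaled by the common phase/smearing/beam/shape factor $p$ (prodterm). This is exactly what the Ix+Qx ... Iy-Qy combinations compute:

$$\begin{pmatrix} \mathrm{out}[0]+i\,\mathrm{out}[1] & \mathrm{out}[2]+i\,\mathrm{out}[3] \\ \mathrm{out}[4]+i\,\mathrm{out}[5] & \mathrm{out}[6]+i\,\mathrm{out}[7] \end{pmatrix} = \begin{pmatrix} I+Q & U+iV \\ U-iV & I-Q \end{pmatrix} p$$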
@@ -431,8 +556,11 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
__global__ void
kernel_coherencies(int B, int N, int T, int K, int F,
 const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
 baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn,
 const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV,
 const unsigned char *__restrict__ stype, const double *__restrict__ sI0,
 const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0,
 const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {

 /* global thread index */
 unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
@@ -451,46 +579,96 @@ kernel_coherencies(int B, int N, int T, int K, int F,

  //TODO: figure out if this max_f makes any sense
#define MAX_F 20
  double l_coh[MAX_F][8];
  for(int cf=0; cf<F; cf++) {
   l_coh[cf][0]=0.0;
   l_coh[cf][1]=0.0;
   l_coh[cf][2]=0.0;
   l_coh[cf][3]=0.0;
   l_coh[cf][4]=0.0;
   l_coh[cf][5]=0.0;
   l_coh[cf][6]=0.0;
   l_coh[cf][7]=0.0;
  }

  // split to two cases, F==1 and F>1
  if (F==1) {
   //use simple for-loop, if K is very large this may be slow and may need further parallelization
   for (int k=0; k<K; k++) {
    //source specific params
    double sIf,sQf,sUf,sVf;
    double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
    sIf=__ldg(&sI[k]);
    sQf=__ldg(&sQ[k]);
    sUf=__ldg(&sU[k]);
    sVf=__ldg(&sV[k]);

    unsigned char stypeT=__ldg(&stype[k]);

    double llcoh[8];
    compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf,
      __ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh);

    l_coh[0][0] +=llcoh[0];
    l_coh[0][1] +=llcoh[1];
    l_coh[0][2] +=llcoh[2];
    l_coh[0][3] +=llcoh[3];
    l_coh[0][4] +=llcoh[4];
    l_coh[0][5] +=llcoh[5];
    l_coh[0][6] +=llcoh[6];
    l_coh[0][7] +=llcoh[7];
   }
  } else {
   //use simple for-loop, if K is very large this may be slow and may need further parallelization
   for (int k=0; k<K; k++) {
    //source specific params
    double sI0f,sQ0f,sU0f,sV0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
    double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
    sI0f=__ldg(&sI0[k]);
    sQ0f=__ldg(&sQ0[k]);
    sU0f=__ldg(&sU0[k]);
    sV0f=__ldg(&sV0[k]);
    spec_idxf=__ldg(&spec_idx[k]);
    spec_idx1f=__ldg(&spec_idx1[k]);
    spec_idx2f=__ldg(&spec_idx2[k]);
    myf0=__ldg(&f0[k]);

    unsigned char stypeT=__ldg(&stype[k]);

    for(int cf=0; cf<F; cf++) {
     double llcoh[8];
     compute_prodterm_multifreq(sta1, sta2, N, K, T, F, phterm0, sI0f, sQ0f, sU0f, sV0f, spec_idxf, spec_idx1f, spec_idx2f,
       myf0, __ldg(&(freqs[cf])), deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n, llcoh);
     l_coh[cf][0] +=llcoh[0];
     l_coh[cf][1] +=llcoh[1];
     l_coh[cf][2] +=llcoh[2];
     l_coh[cf][3] +=llcoh[3];
     l_coh[cf][4] +=llcoh[4];
     l_coh[cf][5] +=llcoh[5];
     l_coh[cf][6] +=llcoh[6];
     l_coh[cf][7] +=llcoh[7];
    }
   }
  }

  //write output with right multi frequency offset
  double *coh1 = &coh[8*n];
  for(int cf=0; cf<F; cf++) {
   coh1[cf*8*B+0] = l_coh[cf][0];
   coh1[cf*8*B+1] = l_coh[cf][1];
   coh1[cf*8*B+2] = l_coh[cf][2];
   coh1[cf*8*B+3] = l_coh[cf][3];
   coh1[cf*8*B+4] = l_coh[cf][4];
   coh1[cf*8*B+5] = l_coh[cf][5];
   coh1[cf*8*B+6] = l_coh[cf][6];
   coh1[cf*8*B+7] = l_coh[cf][7];
  }
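The device buffer written here holds 8 doubles per baseline with the channel as the slow axis: component j of baseline n at channel cf lives at coh[cf*8*B + 8*n + j], matching coh1 = &coh[8*n] and coh1[cf*8*B+j] above. A small index helper illustrating the layout (hypothetical, for illustration only):

    #include <stdio.h>

    /* Index of component j (0..7) of baseline n, channel cf, in the
       device coherency buffer used by kernel_coherencies. */
    static size_t coh_index(size_t B, size_t n, size_t cf, size_t j) {
      return cf * 8 * B + 8 * n + j;
    }

    int main(void) {
      size_t B = 1000;                        /* baselines (example value) */
      printf("%zu\n", coh_index(B, 3, 0, 0)); /* 24: baseline 3, channel 0 */
      printf("%zu\n", coh_index(B, 3, 2, 5)); /* 2*8000 + 24 + 5 = 16029   */
      return 0;
    }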
@@ -498,6 +676,196 @@ kernel_coherencies(int B, int N, int T, int K, int F,

}

/* kernel to calculate residuals */
__global__ void
kernel_residuals(int B, int N, int T, int K, int F,
 const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
 const double *__restrict__ p, int nchunk,
 baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn,
 const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV,
 const unsigned char *__restrict__ stype, const double *__restrict__ sI0,
 const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0,
 const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {

 /* global thread index */
 unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;

 /* each thread will calculate for one baseline, over all sources */
 if (n<B) {
  int sta1=barr[n].sta1;
  int sta2=barr[n].sta2;
  /* find out which time slot this baseline is from */
  int tslot=n/((N*(N-1)/2));
  /* find out which chunk to select from p : 0,1..nchunk-1 */
  int chunk=(n*nchunk)/B;
  /* create G1 and G2 Jones matrices from p */
  cuDoubleComplex G1[4],G2[4];
  G1[0].x=__ldg(&p[chunk*8*N+sta1*8]);
  G1[0].y=__ldg(&p[chunk*8*N+sta1*8+1]);
  G1[1].x=__ldg(&p[chunk*8*N+sta1*8+2]);
  G1[1].y=__ldg(&p[chunk*8*N+sta1*8+3]);
  G1[2].x=__ldg(&p[chunk*8*N+sta1*8+4]);
  G1[2].y=__ldg(&p[chunk*8*N+sta1*8+5]);
  G1[3].x=__ldg(&p[chunk*8*N+sta1*8+6]);
  G1[3].y=__ldg(&p[chunk*8*N+sta1*8+7]);
  G2[0].x=__ldg(&p[chunk*8*N+sta2*8]);
  G2[0].y=__ldg(&p[chunk*8*N+sta2*8+1]);
  G2[1].x=__ldg(&p[chunk*8*N+sta2*8+2]);
  G2[1].y=__ldg(&p[chunk*8*N+sta2*8+3]);
  G2[2].x=__ldg(&p[chunk*8*N+sta2*8+4]);
  G2[2].y=__ldg(&p[chunk*8*N+sta2*8+5]);
  G2[3].x=__ldg(&p[chunk*8*N+sta2*8+6]);
  G2[3].y=__ldg(&p[chunk*8*N+sta2*8+7]);

  double u_n=(u[n]);
  double v_n=(v[n]);
  double w_n=(w[n]);

  //TODO: figure out if this max_f makes any sense
#define MAX_F 20
  double l_coh[MAX_F][8];
  for(int cf=0; cf<F; cf++) {
   l_coh[cf][0]=0.0;
   l_coh[cf][1]=0.0;
   l_coh[cf][2]=0.0;
   l_coh[cf][3]=0.0;
   l_coh[cf][4]=0.0;
   l_coh[cf][5]=0.0;
   l_coh[cf][6]=0.0;
   l_coh[cf][7]=0.0;
  }

  // split to two cases, F==1 and F>1
  if (F==1) {
   //use simple for-loop, if K is very large this may be slow and may need further parallelization
   for (int k=0; k<K; k++) {
    //source specific params
    double sIf,sQf,sUf,sVf;
    double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
    sIf=__ldg(&sI[k]);
    sQf=__ldg(&sQ[k]);
    sUf=__ldg(&sU[k]);
    sVf=__ldg(&sV[k]);

    unsigned char stypeT=__ldg(&stype[k]);

    double llcoh[8];
    compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf,
      __ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh);

    l_coh[0][0] +=llcoh[0];
    l_coh[0][1] +=llcoh[1];
    l_coh[0][2] +=llcoh[2];
    l_coh[0][3] +=llcoh[3];
    l_coh[0][4] +=llcoh[4];
    l_coh[0][5] +=llcoh[5];
    l_coh[0][6] +=llcoh[6];
    l_coh[0][7] +=llcoh[7];

   }
   cuDoubleComplex L1[4],L2[4];
   L1[0].x=l_coh[0][0];
   L1[0].y=l_coh[0][1];
   L1[1].x=l_coh[0][2];
   L1[1].y=l_coh[0][3];
   L1[2].x=l_coh[0][4];
   L1[2].y=l_coh[0][5];
   L1[3].x=l_coh[0][6];
   L1[3].y=l_coh[0][7];
   /* L2=G1*L1 */
   amb(G1,L1,L2);
   /* L1=L2*G2^H */
   ambt(L2,G2,L1);
   l_coh[0][0]=L1[0].x;
   l_coh[0][1]=L1[0].y;
   l_coh[0][2]=L1[1].x;
   l_coh[0][3]=L1[1].y;
   l_coh[0][4]=L1[2].x;
   l_coh[0][5]=L1[2].y;
   l_coh[0][6]=L1[3].x;
   l_coh[0][7]=L1[3].y;

  } else {
   //use simple for-loop, if K is very large this may be slow and may need further parallelization
   for (int k=0; k<K; k++) {
    //source specific params
    double sI0f,sQ0f,sU0f,sV0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
    double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
    sI0f=__ldg(&sI0[k]);
    sQ0f=__ldg(&sQ0[k]);
    sU0f=__ldg(&sU0[k]);
    sV0f=__ldg(&sV0[k]);
    spec_idxf=__ldg(&spec_idx[k]);
    spec_idx1f=__ldg(&spec_idx1[k]);
    spec_idx2f=__ldg(&spec_idx2[k]);
    myf0=__ldg(&f0[k]);

    unsigned char stypeT=__ldg(&stype[k]);

    for(int cf=0; cf<F; cf++) {
     double llcoh[8];
     compute_prodterm_multifreq(sta1, sta2, N, K, T, F, phterm0, sI0f, sQ0f, sU0f, sV0f, spec_idxf, spec_idx1f, spec_idx2f,
       myf0, __ldg(&(freqs[cf])), deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n, llcoh);
     l_coh[cf][0] +=llcoh[0];
     l_coh[cf][1] +=llcoh[1];
     l_coh[cf][2] +=llcoh[2];
     l_coh[cf][3] +=llcoh[3];
     l_coh[cf][4] +=llcoh[4];
     l_coh[cf][5] +=llcoh[5];
     l_coh[cf][6] +=llcoh[6];
     l_coh[cf][7] +=llcoh[7];
    }

   }

   cuDoubleComplex L1[4],L2[4];
   for(int cf=0; cf<F; cf++) {
    L1[0].x=l_coh[cf][0];
    L1[0].y=l_coh[cf][1];
    L1[1].x=l_coh[cf][2];
    L1[1].y=l_coh[cf][3];
    L1[2].x=l_coh[cf][4];
    L1[2].y=l_coh[cf][5];
    L1[3].x=l_coh[cf][6];
    L1[3].y=l_coh[cf][7];
    /* L2=G1*L1 */
    amb(G1,L1,L2);
    /* L1=L2*G2^H */
    ambt(L2,G2,L1);
    l_coh[cf][0]=L1[0].x;
    l_coh[cf][1]=L1[0].y;
    l_coh[cf][2]=L1[1].x;
    l_coh[cf][3]=L1[1].y;
    l_coh[cf][4]=L1[2].x;
    l_coh[cf][5]=L1[2].y;
    l_coh[cf][6]=L1[3].x;
    l_coh[cf][7]=L1[3].y;
   }

  }

  //write output with right multi frequency offset
  double *coh1 = &coh[8*n];
  for(int cf=0; cf<F; cf++) {
   coh1[cf*8*B+0] = l_coh[cf][0];
   coh1[cf*8*B+1] = l_coh[cf][1];
   coh1[cf*8*B+2] = l_coh[cf][2];
   coh1[cf*8*B+3] = l_coh[cf][3];
   coh1[cf*8*B+4] = l_coh[cf][4];
   coh1[cf*8*B+5] = l_coh[cf][5];
   coh1[cf*8*B+6] = l_coh[cf][6];
   coh1[cf*8*B+7] = l_coh[cf][7];
  }

 }
}
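Inside kernel_residuals, the solution chunk for a baseline is picked with integer arithmetic, chunk=(n*nchunk)/B, which splits the B baselines into nchunk nearly equal, contiguous slices. A quick worked example of the mapping (values hypothetical):

    #include <stdio.h>

    int main(void) {
      int B = 10, nchunk = 4;
      /* baselines 0..2 -> chunk 0, 3..4 -> 1, 5..7 -> 2, 8..9 -> 3 */
      for (int n = 0; n < B; n++)
        printf("baseline %d -> chunk %d\n", n, (n * nchunk) / B);
      return 0;
    }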
/* kernel to convert time (JD) to GMST angle*/
__global__ void
@@ -585,7 +953,7 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud

/*
  calculate coherencies:
  B: total baselines (could be more than one timeslot)
  N: no of stations
  T: no of time slots
  K: no of sources
@@ -608,8 +976,8 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud
  dobeam: enable beam if >0
*/
void
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
 unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
 cudaError_t error;
 error = cudaGetLastError();
@@ -620,8 +988,8 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v,
 /* note: make sure we do not exceed max no of blocks available,
    otherwise (too many baselines, loop over source id) */
 int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
 kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
  stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 error = cudaGetLastError();
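The launch configuration here (and in cudakernel_residuals below) uses the usual ceiling division, so every baseline gets a thread even when B is not a multiple of the block size; the if (n<B) guard inside the kernels masks the overhang threads:

$$\text{BlocksPerGrid}=\Big\lfloor\frac{B+T-1}{T}\Big\rfloor=\Big\lceil\frac{B}{T}\Big\rceil$$

For example, B = 1000 baselines with T = ThreadsPerBlock = 128 gives (1000+127)/128 = 8 blocks, i.e. 1024 threads, of which 24 are masked out by the n<B check.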
@@ -634,6 +1002,33 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v,
 }


/* p : parameters, 8N x nchunk values */
void
cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
 unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
 cudaError_t error;
 error = cudaGetLastError();
#endif

 /* spawn threads to handle baselines, these threads will spawn threads for sources */
 int ThreadsPerBlock=DEFAULT_TH_PER_BK;
 /* note: make sure we do not exceed max no of blocks available,
    otherwise (too many baselines, loop over source id) */
 int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
 kernel_residuals<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
  stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 error = cudaGetLastError();
 if(error != cudaSuccess) {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}

/* convert time JD to GMST angle
   store result at the same location */
void
@@ -92,7 +92,7 @@ dtofcopy(int N, float **x_d, double *x) {
 checkCudaError(err,__FILE__,__LINE__);

 /* copy memory */
 err=cudaMemcpy(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);

 /* free host buffer */
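dtofcopy() demotes a host double array to float and places it on the device; the commit switches its copy to a plain cudaMemcpy, which makes the lifetime of the temporary host buffer (freed immediately afterwards) trivially safe. A reduced host-side sketch of the same pattern; dtof_sketch is a hypothetical name, not the commit's function:

    #include <cuda_runtime.h>
    #include <stdlib.h>

    /* Sketch of what dtofcopy() does: demote host doubles to floats and
       upload them; the real function also checks every CUDA error. */
    static int dtof_sketch(int N, float **x_d, const double *x) {
      float *xhost = (float *)malloc((size_t)N * sizeof(float));
      if (!xhost) return -1;
      for (int i = 0; i < N; i++) xhost[i] = (float)x[i]; /* demote */
      float *xc;
      if (cudaMalloc((void **)&xc, (size_t)N * sizeof(float)) != cudaSuccess) {
        free(xhost); return -1;
      }
      /* synchronous copy: safe to free xhost as soon as it returns */
      cudaMemcpy(xc, xhost, (size_t)N * sizeof(float), cudaMemcpyHostToDevice);
      free(xhost);
      *x_d = xc;
      return 0;
    }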
@@ -120,6 +120,15 @@ precalcoh_threadfn(void *data) {
 err=cudaSetDevice(card);
 checkCudaError(err,__FILE__,__LINE__);

 /* make sure enough heap memory is available for shapelet computations */
 size_t plim;
 err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
 checkCudaError(err,__FILE__,__LINE__);
 if (plim<GPU_HEAP_SIZE*1024*1024) {
  err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
  checkCudaError(err,__FILE__,__LINE__);
 }

 double *ud,*vd,*wd;
 double *cohd;
 baseline_t *barrd;
@@ -129,7 +138,7 @@ precalcoh_threadfn(void *data) {
 float **xx_p=0,**yy_p=0,**zz_p=0;
 float **xxd,**yyd,**zzd;
 /* allocate memory in GPU */
 err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(double)); /* coherencies only for 1 cluster, Nf=1 */
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
 checkCudaError(err,__FILE__,__LINE__);
@@ -218,9 +227,16 @@ precalcoh_threadfn(void *data) {
 float *beamd;
 double *lld,*mmd,*nnd;
 double *sId; float *rad,*decd;
 double *sQd,*sUd,*sVd;
 unsigned char *styped;
 double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
 double *sQ0d,*sU0d,*sV0d;
 int **host_p,**dev_p;

 complex double *tempdcoh;
 err=cudaMallocHost((void**)&tempdcoh,sizeof(complex double)*(size_t)t->Nbase*4);
 checkCudaError(err,__FILE__,__LINE__);

 /******************* begin loop over clusters **************************/
 for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
@@ -249,11 +265,29 @@ precalcoh_threadfn(void *data) {
 err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);


 if (t->Nf==1) {
  err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);

  err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
 }


 if (t->dobeam) {
  dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
  dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
@@ -264,7 +298,8 @@ precalcoh_threadfn(void *data) {
 err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);

 /* for multi channel data - FIXME: remove this part because Nf==1 always */
 if (t->Nf>1) {
  err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
@@ -286,6 +321,22 @@ precalcoh_threadfn(void *data) {
 err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);


 err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);

 }

 /* extra info for source, if any */
 if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@@ -347,15 +398,10 @@ precalcoh_threadfn(void *data) {

 /* calculate coherencies for all sources in this cluster, add them up */
 cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
  lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);

 /* copy back coherencies to host,
    coherencies on host have 8M stride, on device have 8 stride */
 err=cudaMemcpy((double*)tempdcoh, cohd, sizeof(double)*t->Nbase*8, cudaMemcpyDeviceToHost);
 checkCudaError(err,__FILE__,__LINE__);
 /* now copy this with right offset and stride */
@@ -363,7 +409,6 @@ precalcoh_threadfn(void *data) {
 my_ccopy(t->Nbase,&tempdcoh[1],4,&(t->coh[4*ncl+1]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[2],4,&(t->coh[4*ncl+2]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[3],4,&(t->coh[4*ncl+3]),4*t->M);


 for (cj=0; cj<t->carr[ncl].N; cj++) {
@@ -402,8 +447,16 @@ precalcoh_threadfn(void *data) {
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(nnd);
 checkCudaError(err,__FILE__,__LINE__);
 if (t->Nf==1) {
  err=cudaFree(sId);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(sQd);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(sUd);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(sVd);
  checkCudaError(err,__FILE__,__LINE__);
 }
 if (t->dobeam) {
  err=cudaFree(rad);
  checkCudaError(err,__FILE__,__LINE__);
@@ -413,6 +466,7 @@ precalcoh_threadfn(void *data) {
 err=cudaFree(styped);
 checkCudaError(err,__FILE__,__LINE__);

 if (t->Nf>1) {
  err=cudaFree(sI0d);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(f0d);
@@ -423,10 +477,21 @@ precalcoh_threadfn(void *data) {
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(spec_idx2d);
 checkCudaError(err,__FILE__,__LINE__);

 err=cudaFree(sQ0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(sU0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(sV0d);
 checkCudaError(err,__FILE__,__LINE__);
 }
}
/******************* end loop over clusters **************************/

/* free memory */
err=cudaFreeHost(tempdcoh);
checkCudaError(err,__FILE__,__LINE__);

err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
@@ -545,7 +610,7 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
 threaddata[nth].x=0; /* no input data */

 threaddata[nth].N=N;
 threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbase x tilesz (from input) */
 threaddata[nth].barr=barr;
 threaddata[nth].carr=carr;
 threaddata[nth].M=M;
@@ -666,7 +731,6 @@ predictvis_threadfn(void *data) {
 cudaError_t err;
 int ci,ncl,cj;

 err=cudaSetDevice(card);
 checkCudaError(err,__FILE__,__LINE__);
@@ -675,12 +739,13 @@ printf("Attath %d to card %d\n",t->tid,card);
 size_t plim;
 err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
 checkCudaError(err,__FILE__,__LINE__);
 if (plim<GPU_HEAP_SIZE*1024*1024) {
  err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
  checkCudaError(err,__FILE__,__LINE__);
 }


 double *ud,*vd,*wd;
 double *cohd;
 baseline_t *barrd;
@@ -782,8 +847,10 @@ printf("Attath %d to card %d\n",t->tid,card);
 float *beamd;
 double *lld,*mmd,*nnd;
 double *sId; float *rad,*decd;
 double *sQd,*sUd,*sVd;
 unsigned char *styped;
 double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
 double *sQ0d,*sU0d,*sV0d;
 int **host_p,**dev_p;

 double *xlocal;
@@ -792,7 +859,6 @@ printf("Attath %d to card %d\n",t->tid,card);

 /******************* begin loop over clusters **************************/
 for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
  /* allocate memory for this cluster's beam */
  if (t->dobeam) {
   err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
@@ -820,11 +886,27 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);


 if (t->Nf==1) {
  err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);

  err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
 }


 if (t->dobeam) {
  dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
  dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
@@ -836,6 +918,7 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);

 /* for multi channel data */
 if (t->Nf>1) {
  err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
@@ -858,6 +941,23 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);


 err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);

 }



 /* extra info for source, if any */
 if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
@@ -920,7 +1020,7 @@ printf("Attath %d to card %d\n",t->tid,card);

 /* calculate coherencies for all sources in this cluster, add them up */
 cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
  lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);

 /* copy back coherencies to host,
    coherencies on host have 8M stride, on device have 8 stride */
@@ -964,8 +1064,16 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(nnd);
 checkCudaError(err,__FILE__,__LINE__);
 if (t->Nf==1) {
  err=cudaFree(sId);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(sQd);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(sUd);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(sVd);
  checkCudaError(err,__FILE__,__LINE__);
 }
 if (t->dobeam) {
  err=cudaFree(rad);
  checkCudaError(err,__FILE__,__LINE__);
@@ -975,6 +1083,7 @@ printf("Attath %d to card %d\n",t->tid,card);
 err=cudaFree(styped);
 checkCudaError(err,__FILE__,__LINE__);

 if (t->Nf>1) {
  err=cudaFree(sI0d);
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(f0d);
@@ -985,6 +1094,14 @@ printf("Attath %d to card %d\n",t->tid,card);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(spec_idx2d);
 checkCudaError(err,__FILE__,__LINE__);

 err=cudaFree(sQ0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(sU0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(sV0d);
 checkCudaError(err,__FILE__,__LINE__);
 }
}
/******************* end loop over clusters **************************/
@@ -1037,6 +1154,7 @@ printf("Attath %d to card %d\n",t->tid,card);
 free(zz_p);
}

/* reset error state */
err=cudaGetLastError();
return NULL;
@@ -1089,7 +1207,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit



 /* set common parameters, and split clusters to threads */
 ci=0;
 for (nth=0; nth<Ngpu && ci<M; nth++) {
  if (ci+Nthb0<M) {
@ -1158,6 +1276,583 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
|
|||
|
||||
|
||||
|
||||
free(xlocal);
|
||||
free(threaddata);
|
||||
destroy_task_hist(&thst);
|
||||
free(th_array);
|
||||
pthread_attr_destroy(&attr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void *
|
||||
residual_threadfn(void *data) {
|
||||
thread_data_pred_t *t=(thread_data_pred_t*)data;
|
||||
/* first, select a GPU, if total clusters < MAX_GPU_ID
|
||||
use random selection, elese use this thread id */
|
||||
int card;
|
||||
if (t->M<=MAX_GPU_ID) {
|
||||
card=select_work_gpu(MAX_GPU_ID,t->hst);
|
||||
} else {
|
||||
card=t->tid;
|
||||
}
|
||||
cudaError_t err;
|
||||
int ci,ncl,cj;
|
||||
|
||||
|
||||
err=cudaSetDevice(card);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
/* make sure enough heap memory is available for shapelet computations */
|
||||
size_t plim;
|
||||
err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
if (plim<GPU_HEAP_SIZE*1024*1024) {
|
||||
err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, GPU_HEAP_SIZE*1024*1024);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
}
|
||||
|
||||
|
||||
|
||||
double *ud,*vd,*wd;
|
||||
double *cohd;
|
||||
baseline_t *barrd;
|
||||
double *freqsd;
|
||||
float *longd=0,*latd=0; double *timed;
|
||||
int *Nelemd;
|
||||
float **xx_p=0,**yy_p=0,**zz_p=0;
|
||||
float **xxd,**yyd,**zzd;
|
||||
/* allocate memory in GPU */
|
||||
err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
/* copy to device */
|
||||
/* u,v,w and l,m,n coords need to be double for precision */
|
||||
err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
|
||||
err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);

/* check whether beam is actually calculated */
if (t->dobeam) {
dtofcopy(t->N,&longd,t->longitude);
dtofcopy(t->N,&latd,t->latitude);
err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* convert time JD to GMST angle (in place, on the device) */
cudakernel_convert_time(t->tilesz,timed);

err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* jagged arrays for element locations */
err=cudaMalloc((void**)&xxd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yyd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zzd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
/* allocate host memory to store pointers */
if ((xx_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((yy_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((zz_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
/* copy element coordinates to the device; dtofcopy() allocates the
 device (float) arrays itself, so a separate cudaMalloc here would
 only leak the first allocation */
for (ci=0; ci<t->N; ci++) {
dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
}
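/* xx_p/yy_p/zz_p are host arrays whose entries are device addresses;
 copying them below gives the kernels a device-resident array of device
 pointers, i.e. a jagged 2D layout without flattening */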
/* now copy pointer locations to device */
err=cudaMemcpy(xxd, xx_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(yyd, yy_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(zzd, zz_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}

float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;

double *xlocal;
err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
checkCudaError(err,__FILE__,__LINE__);

double *pd; /* parameter array per cluster */

/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
/* allocate memory for this cluster's beam */
if (t->dobeam) {
err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
checkCudaError(err,__FILE__,__LINE__);
} else {
beamd=0;
}

/* copy cluster details to GPU */
err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
checkCudaError(err,__FILE__,__LINE__);

err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

/* parameter vector size may change depending on hybrid parameter */
err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}

if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
} else {
rad=0;
decd=0;
}
err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

/* for multi-channel data */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

}

/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
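/* host_p[] itself lives on the host, but each non-point entry filled in
 below holds a device address of the per-source extra info struct */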

for (cj=0; cj<t->carr[ncl].N; cj++) {

if (t->carr[ncl].stype[cj]==STYPE_POINT) {
host_p[cj]=0;
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
checkCudaError(err,__FILE__,__LINE__);
double *modes;
err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
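/* host_p[cj] is the device copy of the struct; overwrite its modes
 member so it points at the device modes array instead of the host one */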
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}

}
/* now copy pointer locations to device */
err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

if (t->dobeam) {
/* now calculate beam for all sources in this cluster */
cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
}

/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);

/* copy back coherencies to host,
 coherencies on host have 8M stride, on device have 8 stride */
err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
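/* t->x accumulates model visibilities over all clusters handled by this
 thread; the caller subtracts the total from the data to form residuals */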

for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
double *modes=0;
err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(modes);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
}
}
free(host_p);

err=cudaFree(dev_p);
checkCudaError(err,__FILE__,__LINE__);

if (t->dobeam) {
err=cudaFree(beamd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(lld);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(mmd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(pd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(decd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);

if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx1d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);

err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
/******************* end loop over clusters **************************/

/* free memory */
err=cudaFreeHost(xlocal);
checkCudaError(err,__FILE__,__LINE__);

err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(wd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(cohd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(barrd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(freqsd);
checkCudaError(err,__FILE__,__LINE__);
/* Nelemd is allocated unconditionally, so free it in both cases */
err=cudaFree(Nelemd);
checkCudaError(err,__FILE__,__LINE__);

if (t->dobeam) {
err=cudaFree(longd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(latd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(timed);
checkCudaError(err,__FILE__,__LINE__);

for (ci=0; ci<t->N; ci++) {
err=cudaFree(xx_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yy_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zz_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
}

err=cudaFree(xxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yyd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zzd);
checkCudaError(err,__FILE__,__LINE__);

free(xx_p);
free(yy_p);
free(zz_p);
}

/* reset error state */
err=cudaGetLastError();
return NULL;
}


/* p: 8NMx1 parameter array, but M is 'effective' clusters, so carr must be used to find the right offset
 ccid: which cluster to use as correction
 rho: MMSE robust parameter; (J+rho I) is inverted
 phase_only: if >0, and if there is any correction done, use only the phase of the diagonal elements for the correction
*/
int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only) {

int nth,ci;

int Nthb0,Nthb;
pthread_attr_t attr;
pthread_t *th_array;
thread_data_pred_t *threaddata;
taskhist thst;
init_task_hist(&thst);

/* oversubscribe GPU */
int Ngpu=MAX_GPU_ID+1;

/* calculate the minimum number of clusters a thread can handle */
Nthb0=(M+Ngpu-1)/Ngpu;
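/* i.e. Nthb0=ceil(M/Ngpu), so Ngpu threads cover all M clusters */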

/* setup threads : note: Ngpu is the number of GPUs used */
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);

if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}

/* arrays to store result */
double *xlocal;
if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}

/* set common parameters, and split clusters to threads */
ci=0;
for (nth=0; nth<Ngpu && ci<M; nth++) {
if (ci+Nthb0<M) {
Nthb=Nthb0;
} else {
Nthb=M-ci;
}

threaddata[nth].hst=&thst; /* for load balancing */
threaddata[nth].tid=nth;
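/* tid doubles as the GPU id inside residual_threadfn() whenever the
 cluster count exceeds MAX_GPU_ID */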

threaddata[nth].u=u;
threaddata[nth].v=v;
threaddata[nth].w=w;
threaddata[nth].coh=0;
threaddata[nth].x=&xlocal[nth*Nbase*8*tilesz*Nchan]; /* distinct arrays to get back the result */

threaddata[nth].N=N;
threaddata[nth].Nbase=Nbase*tilesz; /* total baselines: actually Nbase x tilesz */
threaddata[nth].barr=barr;
threaddata[nth].carr=carr;
threaddata[nth].M=M;
threaddata[nth].Nf=Nchan;
threaddata[nth].freqs=freqs;
threaddata[nth].fdelta=fdelta;
threaddata[nth].tdelta=tdelta;
threaddata[nth].dec0=dec0;

threaddata[nth].dobeam=dobeam;
threaddata[nth].ph_ra0=ph_ra0;
threaddata[nth].ph_dec0=ph_dec0;
threaddata[nth].ph_freq0=ph_freq0;
threaddata[nth].longitude=longitude;
threaddata[nth].latitude=latitude;
threaddata[nth].time_utc=time_utc;
threaddata[nth].tilesz=tilesz;
threaddata[nth].Nelem=Nelem;
threaddata[nth].xx=xx;
threaddata[nth].yy=yy;
threaddata[nth].zz=zz;

/* parameters */
threaddata[nth].p=p;

threaddata[nth].Ns=Nthb;
threaddata[nth].soff=ci;

pthread_create(&th_array[nth],&attr,residual_threadfn,(void*)(&threaddata[nth]));
/* next cluster set */
ci=ci+Nthb;
}

/* now wait for threads to finish */
for(ci=0; ci<nth; ci++) {
pthread_join(th_array[ci],NULL);
/* subtract to find residual */
my_daxpy(Nbase*8*tilesz*Nchan,threaddata[ci].x,-1.0,x);
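/* my_daxpy with scale -1.0: x <- x - threaddata[ci].x, i.e. data minus
 this thread's accumulated model */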
}

free(xlocal);
free(threaddata);
destroy_task_hist(&thst);
@ -2658,11 +2658,16 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int tilesz, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt);


/* FIXME: add_to_data: no subtraction is done */
extern int
predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data);

extern int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only);


#endif /*!HAVE_CUDA */
@ -2675,6 +2680,12 @@ predict_visibilities_multifreq_withbeam_gpu(double *u,double *v,double *w,double
#ifndef ARRAY_MAX_ELEM /* if using shared memory, max possible elements for a station */
#define ARRAY_MAX_ELEM 512
#endif
/* default GPU heap size (in MB) needed to calculate some shapelet models;
 if the model has n0>20 or so, try increasing this and recompiling
 (the default GPU heap is ~8MB) */
#ifndef GPU_HEAP_SIZE
#define GPU_HEAP_SIZE 20
#endif
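/* the #ifndef guard means this can also be raised at build time, e.g. by
 passing -DGPU_HEAP_SIZE=32 to the compiler (exact flag wiring depends
 on the build setup) */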

extern void

@ -2683,9 +2694,12 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud

extern void
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI,
unsigned char *stype, double *sI0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam);
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam);

extern void
cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam);

extern void
cudakernel_convert_time(int T, double *time_utc);