added function to predict using stored solutions, with GPU accel
This commit is contained in:
parent 41cf1b3b69
commit 157eaf84cd
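The new GPU prediction routine introduced below takes an add_to_data flag with three simulation modes: predict only (the output is overwritten with the model), add the model to the data, or subtract it; the per-thread results are combined with my_daxpy() after the worker threads are joined. A minimal, self-contained sketch of that accumulation logic is shown here. The SIMUL_* values and the daxpy helper are stand-ins (assumptions) for the ones defined in this code base, and the real routine applies the update per worker thread rather than to a single small buffer.

#include <stdio.h>
#include <string.h>

/* assumed mode values: the real SIMUL_* constants live in this code base's headers */
enum { SIMUL_ONLY=1, SIMUL_ADD=2, SIMUL_SUB=3 };

/* minimal stand-in for my_daxpy(): y += a*x over n elements */
static void daxpy_sketch(int n, const double *x, double a, double *y) {
  for (int i=0; i<n; i++) { y[i] += a*x[i]; }
}

int main(void) {
  double x[4]={10.0,10.0,10.0,10.0};   /* "data" buffer */
  double model[4]={1.0,2.0,3.0,4.0};   /* predicted visibilities from one worker thread */
  int add_to_data=SIMUL_SUB;

  if (add_to_data==SIMUL_ONLY) {
    memset(x,0,sizeof(x));             /* predict-only: output ends up holding just the model */
  }
  if (add_to_data==SIMUL_ONLY || add_to_data==SIMUL_ADD) {
    daxpy_sketch(4,model,1.0,x);       /* add the model to the data */
  } else {
    daxpy_sketch(4,model,-1.0,x);      /* subtract the model from the data */
  }
  printf("%g %g %g %g\n",x[0],x[1],x[2],x[3]);
  return 0;
}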
@@ -740,10 +740,16 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
}
#endif
} else {
read_solutions(sfp,p,carr,iodata.N,M);
/* if solution file is given, read in the solutions and predict */
read_solutions(sfp,p,carr,iodata.N,M);

if (GPUpredict) {
predict_visibilities_withsol_withbeam_gpu(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,Data::DoSim,Data::ccid,Data::rho,Data::phaseOnly);
} else {
predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::DoSim,Data::ccid,Data::rho,Data::phaseOnly);
}
}
/************ end simulation only mode ***************************/
}

@@ -394,6 +394,9 @@ extern int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only);

extern int
predict_visibilities_withsol_withbeam_gpu(double *u,double *v,double *w,double *p,double *x, int *ignorelist, int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data, int ccid, double rho, int phase_only);
#endif /*!HAVE_CUDA */

/****************************** predict_model.cu ****************************/

@@ -106,6 +106,9 @@ typedef struct thread_data_pred_t_ {
/* following are only used while predict with gain */
double *p; /* parameter array, size could be 8*N*Mx1 (full) or 8*Nx1 (single)*/

/* following only used to ignore some clusters while predicting */
int *ignlist;

} thread_data_pred_t;

/* struct for passing data to threads for correcting data */
@@ -2051,3 +2054,678 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit

return 0;
}


static void *
predict_threadfn(void *data) {
thread_data_pred_t *t=(thread_data_pred_t*)data;
/* first, select a GPU, if total clusters < MAX_GPU_ID
use random selection, else use this thread id */
int card;
if (t->M<=MAX_GPU_ID) {
card=select_work_gpu(MAX_GPU_ID,t->hst);
} else {
card=t->tid;
}
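/* card now indexes the GPU this thread will run on: select_work_gpu() picks a device using the shared task history, otherwise the thread id is used directly */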
cudaError_t err;
int ci,ncl,cj;

err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);

double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
double *freqsd;
float *longd=0,*latd=0; double *timed;
int *Nelemd;
float **xx_p=0,**yy_p=0,**zz_p=0;
float **xxd,**yyd,**zzd;
/* allocate memory in GPU */
err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
checkCudaError(err,__FILE__,__LINE__);

/* copy to device */
/* u,v,w and l,m,n coords need to be double for precision */
err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

/* check if beam is actually calculated */
if (t->dobeam) {
dtofcopy(t->N,&longd,t->longitude);
dtofcopy(t->N,&latd,t->latitude);
err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* convert time jd to GMST angle */
cudakernel_convert_time(t->tilesz,timed);
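/* timed now holds the converted GMST angles that the beam kernel below consumes */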

err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

/* jagged arrays for element locations */
err=cudaMalloc((void**)&xxd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yyd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zzd, t->N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
/* allocate host memory to store pointers */
if ((xx_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((yy_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((zz_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
for (ci=0; ci<t->N; ci++) {
err=cudaMalloc((void**)&xx_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yy_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zz_p[ci], t->Nelem[ci]*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
}
/* now copy data */
for (ci=0; ci<t->N; ci++) {
dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
}
/* now copy pointer locations to device */
err=cudaMemcpy(xxd, xx_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(yyd, yy_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
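/* beam-related arrays (station longitudes/latitudes, per-station element coordinates as single-precision jagged arrays, and times) are now staged on the device and stay allocated for the whole cluster loop below */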

float *beamd;
double *lld,*mmd,*nnd;
double *sId; float *rad,*decd;
double *sQd,*sUd,*sVd;
unsigned char *styped;
double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
double *sQ0d,*sU0d,*sV0d;
int **host_p,**dev_p;

double *xlocal;
err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
checkCudaError(err,__FILE__,__LINE__);

double *pd; /* parameter array per cluster */

/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
/* check if cluster id is not part of the ignore list */
if (!t->ignlist[ncl]) {
/* allocate memory for this cluster's beam */
if (t->dobeam) {
err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
checkCudaError(err,__FILE__,__LINE__);
} else {
beamd=0;
}

/* copy cluster details to GPU */
err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
checkCudaError(err,__FILE__,__LINE__);

err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

/* parameter vector size may change depending on hybrid parameter */
err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
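/* pd now holds this cluster's stored solutions: 8*N values per chunk, t->carr[ncl].nchunk chunks, copied from the global parameter vector t->p */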

if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}

if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
} else {
rad=0;
decd=0;
}
err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

/* for multi channel data */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

}

/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);

for (cj=0; cj<t->carr[ncl].N; cj++) {

if (t->carr[ncl].stype[cj]==STYPE_POINT) {
host_p[cj]=0;
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
checkCudaError(err,__FILE__,__LINE__);
double *modes;
err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}

}
/* now copy pointer locations to device */
err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);

if (t->dobeam) {
/* now calculate beam for all sources in this cluster */
cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
}

/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);

/* copy back coherencies to host,
coherencies on host have 8M stride, on device have 8 stride */
err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
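/* accumulate this cluster's predicted coherencies into the thread-local output buffer; the sum over all non-ignored clusters assigned to this thread builds up in t->x */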

for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
double *modes=0;
err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
err=cudaFree(modes);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
}
}
free(host_p);

err=cudaFree(dev_p);
checkCudaError(err,__FILE__,__LINE__);

if (t->dobeam) {
err=cudaFree(beamd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(lld);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(mmd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(pd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(decd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);

if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx1d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);

err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
}
/******************* end loop over clusters **************************/

/* free memory */
err=cudaFreeHost(xlocal);
checkCudaError(err,__FILE__,__LINE__);

err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(wd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(cohd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(barrd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(freqsd);
checkCudaError(err,__FILE__,__LINE__);

if (t->dobeam) {
err=cudaFree(longd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(latd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(timed);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(Nelemd);
checkCudaError(err,__FILE__,__LINE__);

for (ci=0; ci<t->N; ci++) {
err=cudaFree(xx_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yy_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zz_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
}

err=cudaFree(xxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yyd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zzd);
checkCudaError(err,__FILE__,__LINE__);

free(xx_p);
free(yy_p);
free(zz_p);
}

cudaDeviceSynchronize();
/* reset error state */
err=cudaGetLastError();
return NULL;

}

/*
almost same as calculate_residuals_multifreq_withbeam_gpu(), but ignorelist is used to ignore prediction of some clusters, and predict/add/subtract options based on simulation mode add_to_data

int * ignorelist: Mx1 array
int add_to_data: predict/add/subtract mode
*/
int
predict_visibilities_withsol_withbeam_gpu(double *u,double *v,double *w,double *p,double *x, int *ignorelist, int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data, int ccid, double rho, int phase_only) {

int nth,ci;

int Nthb0,Nthb;
pthread_attr_t attr;
pthread_t *th_array;
thread_data_pred_t *threaddata;
thread_data_corr_t *threaddata_corr;
taskhist thst;
init_task_hist(&thst);

/* oversubscribe GPU */
int Ngpu=MAX_GPU_ID+1;

/* calculate min clusters thread can handle */
Nthb0=(M+Ngpu-1)/Ngpu;
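/* Nthb0 is ceil(M/Ngpu), so the M clusters are split nearly evenly over the Ngpu worker threads */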

/* setup threads : note: Ngpu is no of GPUs used */
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);

if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}

/* arrays to store result */
double *xlocal;
if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}

/* set common parameters, and split clusters to threads */
ci=0;
for (nth=0; nth<Ngpu && ci<M; nth++) {
if (ci+Nthb0<M) {
Nthb=Nthb0;
} else {
Nthb=M-ci;
}

threaddata[nth].hst=&thst; /* for load balancing */
threaddata[nth].tid=nth;

threaddata[nth].u=u;
threaddata[nth].v=v;
threaddata[nth].w=w;
threaddata[nth].coh=0;
threaddata[nth].x=&xlocal[nth*Nbase*8*tilesz*Nchan]; /* distinct arrays to get back the result */

threaddata[nth].N=N;
threaddata[nth].Nbase=Nbase*tilesz; /* total baselines: actually Nbase x tilesz */
threaddata[nth].barr=barr;
threaddata[nth].carr=carr;
threaddata[nth].M=M;
threaddata[nth].Nf=Nchan;
threaddata[nth].freqs=freqs;
threaddata[nth].fdelta=fdelta;
threaddata[nth].tdelta=tdelta;
threaddata[nth].dec0=dec0;

threaddata[nth].dobeam=dobeam;
threaddata[nth].ph_ra0=ph_ra0;
threaddata[nth].ph_dec0=ph_dec0;
threaddata[nth].ph_freq0=ph_freq0;
threaddata[nth].longitude=longitude;
threaddata[nth].latitude=latitude;
threaddata[nth].time_utc=time_utc;
threaddata[nth].tilesz=tilesz;
threaddata[nth].Nelem=Nelem;
threaddata[nth].xx=xx;
threaddata[nth].yy=yy;
threaddata[nth].zz=zz;

/* parameters */
threaddata[nth].p=p;

threaddata[nth].Ns=Nthb;
threaddata[nth].soff=ci;

/* which clusters to ignore */
threaddata[nth].ignlist=ignorelist;

pthread_create(&th_array[nth],&attr,predict_threadfn,(void*)(&threaddata[nth]));
/* next source set */
ci=ci+Nthb;
}

if (add_to_data==SIMUL_ONLY) {
/* set input to zero */
memset(x,0,sizeof(double)*8*Nbase*tilesz*Nchan);
}
/* now wait for threads to finish */
for(ci=0; ci<nth; ci++) {
pthread_join(th_array[ci],NULL);
if (add_to_data==SIMUL_ONLY || add_to_data==SIMUL_ADD) {
/* add to input value */
my_daxpy(Nbase*8*tilesz*Nchan,threaddata[ci].x,1.0,x);
} else {
/* subtract from input value */
my_daxpy(Nbase*8*tilesz*Nchan,threaddata[ci].x,-1.0,x);
}
}
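/* at this point x holds data plus/minus the predicted model (or only the model when add_to_data==SIMUL_ONLY) */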

free(xlocal);
free(threaddata);

/* find if any cluster is specified for correction of data */
int cm,cj;
cm=-1;
for (cj=0; cj<M; cj++) { /* clusters */
/* check if cluster id == ccid to do a correction */
if (carr[cj].id==ccid) {
/* valid cluster found */
cm=cj;
}
}
/* correction of data, if valid cluster is given */
if (cm>=0) {
double *pm,*pinv,*pphase=0;
/* allocate memory for inverse J */
if ((pinv=(double*)malloc((size_t)8*N*carr[cm].nchunk*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if (!phase_only) {
for (cj=0; cj<carr[cm].nchunk; cj++) {
pm=&(p[carr[cm].p[cj]]); /* start of solutions */
/* invert N solutions */
for (ci=0; ci<N; ci++) {
mat_invert(&pm[8*ci],&pinv[8*ci+8*N*cj], rho);
}
}
} else {
/* joint diagonalize solutions and get only phases before inverting */
if ((pphase=(double*)malloc((size_t)8*N*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
for (cj=0; cj<carr[cm].nchunk; cj++) {
pm=&(p[carr[cm].p[cj]]); /* start of solutions */
/* extract phase of pm, output to pphase */
extract_phases(pm,pphase,N,10);
/* invert N solutions */
for (ci=0; ci<N; ci++) {
mat_invert(&pphase[8*ci],&pinv[8*ci+8*N*cj], rho);
}
}
free(pphase);
}
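/* pinv now holds the inverted (full or phase-only) solutions of the selected cluster; they are applied to the data by correct_threadfn() below */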

/* divide x[] over GPUs */
int Nbase1=Nbase*tilesz;
Nthb0=(Nbase1+Ngpu-1)/Ngpu;

if ((threaddata_corr=(thread_data_corr_t*)malloc((size_t)Ngpu*sizeof(thread_data_corr_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
ci=0;
for (nth=0; nth<Ngpu && ci<Nbase1; nth++) {
/* this thread will handle baselines [ci:min(Nbase-1,ci+Nthb0-1)] */
/* determine actual no. of baselines */
if (ci+Nthb0<Nbase1) {
Nthb=Nthb0;
} else {
Nthb=Nbase1-ci;
}

threaddata_corr[nth].hst=&thst;
threaddata_corr[nth].tid=nth;
threaddata_corr[nth].boff=ci;
threaddata_corr[nth].Nb=Nthb;
threaddata_corr[nth].barr=barr;
threaddata_corr[nth].x=x;
threaddata_corr[nth].N=N;
threaddata_corr[nth].Nbase=Nbase1;
threaddata_corr[nth].Nf=Nchan;
/* for correction of data */
threaddata_corr[nth].pinv=pinv;
threaddata_corr[nth].nchunk=carr[cm].nchunk;

pthread_create(&th_array[nth],&attr,correct_threadfn,(void*)(&threaddata_corr[nth]));
/* next baseline set */
ci=ci+Nthb;
}

/* now wait for threads to finish */
for(ci=0; ci<nth; ci++) {
pthread_join(th_array[ci],NULL);
}

free(pinv);
free(threaddata_corr);
}

free(th_array);
pthread_attr_destroy(&attr);

destroy_task_hist(&thst);

return 0;
}
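
As a usage note, the caller passes an Mx1 ignorelist where a nonzero entry makes predict_threadfn() skip that cluster (see the `if (!t->ignlist[ncl])` test above). A hypothetical helper for building such a list, not part of this commit, might look like:

#include <stdio.h>
#include <stdlib.h>

/* hypothetical helper (illustration only): mark the clusters listed in
   skip_idx[] so they are ignored during prediction */
static int *
build_ignorelist(int M, const int *skip_idx, int nskip) {
  int *ignorelist=(int*)calloc((size_t)M,sizeof(int));
  if (!ignorelist) {
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
    exit(1);
  }
  for (int ci=0; ci<nskip; ci++) {
    if (skip_idx[ci]>=0 && skip_idx[ci]<M) {
      ignorelist[skip_idx[ci]]=1; /* nonzero => cluster is skipped */
    }
  }
  return ignorelist;
}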