added a function to predict visibilities using stored solutions, with GPU acceleration

This commit is contained in:
Sarod Yatawatta 2018-06-19 14:42:03 +02:00
parent 41cf1b3b69
commit 157eaf84cd
3 changed files with 689 additions and 2 deletions


@@ -740,10 +740,16 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
}
#endif
} else {
/* if solution file is given, read in the solutions and predict */
read_solutions(sfp,p,carr,iodata.N,M);
if (GPUpredict) {
predict_visibilities_withsol_withbeam_gpu(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,Data::DoSim,Data::ccid,Data::rho,Data::phaseOnly);
} else {
predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::DoSim,Data::ccid,Data::rho,Data::phaseOnly);
}
}
/************ end simulation only mode ***************************/
}
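For context, a minimal sketch of how a caller might construct the ignorelist consumed above; this helper is hypothetical and not part of this commit, and it assumes clusters carry user-visible ids in carr[].id (nonzero entries mark clusters to skip, matching the check in predict_threadfn below):

/* hypothetical helper: skip every cluster except those whose id is in keep[] */
static void
build_ignorelist(int *ignorelist, clus_source_t *carr, int M, int *keep, int nkeep) {
 int ci,cj;
 for (ci=0; ci<M; ci++) {
  ignorelist[ci]=1; /* nonzero: this cluster is not predicted */
  for (cj=0; cj<nkeep; cj++) {
   if (carr[ci].id==keep[cj]) { ignorelist[ci]=0; break; }
  }
 }
}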


@@ -394,6 +394,9 @@ extern int
calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only);
extern int
predict_visibilities_withsol_withbeam_gpu(double *u,double *v,double *w,double *p,double *x, int *ignorelist, int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data, int ccid, double rho, int phase_only);
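/* ignorelist: Mx1 array, nonzero entries mark clusters to skip while predicting
   add_to_data: simulation mode, selects predict/add/subtract (SIMUL_ONLY, SIMUL_ADD, ...)
   ccid: id of the cluster (if any) whose inverted solutions are applied to correct the data
   rho: passed to mat_invert() when inverting the solutions
   phase_only: if nonzero, only the phases of the solutions are used (via extract_phases()) */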
#endif /*!HAVE_CUDA */
/****************************** predict_model.cu ****************************/


@@ -106,6 +106,9 @@ typedef struct thread_data_pred_t_ {
/* the following are only used when predicting with gains */
double *p; /* parameter array, size could be 8*N*Mx1 (full) or 8*Nx1 (single)*/
/* following only used to ignore some clusters while predicting */
int *ignlist;
} thread_data_pred_t;
/* struct for passing data to threads for correcting data */
@@ -2051,3 +2054,678 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude
return 0;
}
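/* worker thread: predicts visibilities for a chunk of clusters on one GPU,
   accumulating the result into the thread's own output buffer */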
static void *
predict_threadfn(void *data) {
thread_data_pred_t *t=(thread_data_pred_t*)data;
/* first, select a GPU: if total clusters <= MAX_GPU_ID,
 use the load-balancing task history to pick a GPU, else use this thread id */
int card;
if (t->M<=MAX_GPU_ID) {
card=select_work_gpu(MAX_GPU_ID,t->hst);
} else {
card=t->tid;
}
cudaError_t err;
int ci,ncl,cj;
err=cudaSetDevice(card);
checkCudaError(err,__FILE__,__LINE__);
double *ud,*vd,*wd;
double *cohd;
baseline_t *barrd;
double *freqsd;
float *longd=0,*latd=0; double *timed;
int *Nelemd;
float **xx_p=0,**yy_p=0,**zz_p=0;
float **xxd,**yyd,**zzd;
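/* single precision device copies of the station element coordinates, only set up when the beam is evaluated */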
/* allocate memory in GPU */
err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies for 1 cluster, Nf freqs; summed over clusters on the host */
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
checkCudaError(err,__FILE__,__LINE__);
/* copy to device */
/* u,v,w and l,m,n coords need to be double for precision */
err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* check if beam is actually calculated */
if (t->dobeam) {
dtofcopy(t->N,&longd,t->longitude);
dtofcopy(t->N,&latd,t->latitude);
err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* convert time jd to GMST angle */
cudakernel_convert_time(t->tilesz,timed);
err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* jagged arrays for element locations */
err=cudaMalloc((void**)&xxd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&yyd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&zzd, t->N*sizeof(float*));
checkCudaError(err,__FILE__,__LINE__);
/* allocate host memory to store pointers */
if ((xx_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((yy_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((zz_p=(float**)calloc((size_t)t->N,sizeof(float*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
/* copy element coordinates: dtofcopy() allocates the device float arrays
 and copies the double precision host data into them, so no separate
 cudaMalloc() is needed here */
for (ci=0; ci<t->N; ci++) {
dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
}
/* now copy pointer locations to device */
err=cudaMemcpy(xxd, xx_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(yyd, yy_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(zzd, zz_p, t->N*sizeof(float*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* initialize to 0: depending on Nf and dobeam some of these stay unused,
 but they are still passed to the kernel, so they must not be garbage */
float *beamd=0;
double *lld=0,*mmd=0,*nnd=0;
double *sId=0; float *rad=0,*decd=0;
double *sQd=0,*sUd=0,*sVd=0;
unsigned char *styped=0;
double *sI0d=0,*f0d=0,*spec_idxd=0,*spec_idx1d=0,*spec_idx2d=0;
double *sQ0d=0,*sU0d=0,*sV0d=0;
int **host_p,**dev_p;
double *xlocal;
err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
checkCudaError(err,__FILE__,__LINE__);
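/* xlocal is pinned host memory, for faster device-to-host copies of the per-cluster coherencies */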
double *pd; /* parameter array per cluster */
/******************* begin loop over clusters **************************/
for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
/* check if cluster id is not part of the ignore list */
if (!t->ignlist[ncl]) {
/* allocate memory for this cluster's beam */
if (t->dobeam) {
err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
checkCudaError(err,__FILE__,__LINE__);
} else {
beamd=0;
}
/* copy cluster details to GPU */
err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* parameter vector size may change depending on hybrid parameter */
err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
} else {
rad=0;
decd=0;
}
err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
/* for multi channel data */
if (t->Nf>1) {
err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
/* extra info for source, if any */
if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
checkCudaError(err,__FILE__,__LINE__);
for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
host_p[cj]=0;
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
checkCudaError(err,__FILE__,__LINE__);
double *modes;
err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
}
}
/* now copy pointer locations to device */
err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
/* now calculate beam for all sources in this cluster */
cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
}
/* calculate coherencies for all sources in this cluster, add them up */
cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
/* copy the accumulated coherencies back to the host
 and add them to this thread's output buffer */
err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
for (cj=0; cj<t->carr[ncl].N; cj++) {
if (t->carr[ncl].stype[cj]==STYPE_POINT) {
} else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
double *modes=0;
err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(modes);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
} else if (t->carr[ncl].stype[cj]==STYPE_RING) {
err=cudaFree(host_p[cj]);
checkCudaError(err,__FILE__,__LINE__);
}
}
free(host_p);
err=cudaFree(dev_p);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
err=cudaFree(beamd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(lld);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(mmd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(nnd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(pd);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf==1) {
err=cudaFree(sId);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sUd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sVd);
checkCudaError(err,__FILE__,__LINE__);
}
if (t->dobeam) {
err=cudaFree(rad);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(decd);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(styped);
checkCudaError(err,__FILE__,__LINE__);
if (t->Nf>1) {
err=cudaFree(sI0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(f0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx1d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(spec_idx2d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sQ0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sU0d);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(sV0d);
checkCudaError(err,__FILE__,__LINE__);
}
}
}
/******************* end loop over clusters **************************/
/* free memory */
err=cudaFreeHost(xlocal);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(ud);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(vd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(wd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(cohd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(barrd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(freqsd);
checkCudaError(err,__FILE__,__LINE__);
if (t->dobeam) {
err=cudaFree(longd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(latd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(timed);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(Nelemd);
checkCudaError(err,__FILE__,__LINE__);
for (ci=0; ci<t->N; ci++) {
err=cudaFree(xx_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yy_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zz_p[ci]);
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaFree(xxd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(yyd);
checkCudaError(err,__FILE__,__LINE__);
err=cudaFree(zzd);
checkCudaError(err,__FILE__,__LINE__);
free(xx_p);
free(yy_p);
free(zz_p);
}
cudaDeviceSynchronize();
/* reset error state */
err=cudaGetLastError();
return NULL;
}
/*
 almost the same as calculate_residuals_multifreq_withbeam_gpu(), except that
 ignorelist is used to skip prediction of selected clusters, and the simulation
 mode add_to_data selects predict/add/subtract behaviour
 int *ignorelist: Mx1 array, nonzero entries mark clusters to skip
 int add_to_data: SIMUL_ONLY replaces the data, SIMUL_ADD adds the prediction,
 any other mode subtracts it
*/
int
predict_visibilities_withsol_withbeam_gpu(double *u,double *v,double *w,double *p,double *x, int *ignorelist, int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data, int ccid, double rho, int phase_only) {
int nth,ci;
int Nthb0,Nthb;
pthread_attr_t attr;
pthread_t *th_array;
thread_data_pred_t *threaddata;
thread_data_corr_t *threaddata_corr;
taskhist thst;
init_task_hist(&thst);
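/* task history: lets select_work_gpu() balance work across the available GPUs */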
/* oversubscribe GPUs */
int Ngpu=MAX_GPU_ID+1;
/* calculate the minimum number of clusters a thread can handle: ceil(M/Ngpu) */
Nthb0=(M+Ngpu-1)/Ngpu;
/* set up threads; note: Ngpu is the number of GPUs used */
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* arrays to store result */
double *xlocal;
if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* set common parameters, and split clusters to threads */
ci=0;
for (nth=0; nth<Ngpu && ci<M; nth++) {
if (ci+Nthb0<M) {
Nthb=Nthb0;
} else {
Nthb=M-ci;
}
threaddata[nth].hst=&thst; /* for load balancing */
threaddata[nth].tid=nth;
threaddata[nth].u=u;
threaddata[nth].v=v;
threaddata[nth].w=w;
threaddata[nth].coh=0;
threaddata[nth].x=&xlocal[nth*Nbase*8*tilesz*Nchan]; /* distinct arrays to get back the result */
threaddata[nth].N=N;
threaddata[nth].Nbase=Nbase*tilesz; /* total baselines: actually Nbase x tilesz */
threaddata[nth].barr=barr;
threaddata[nth].carr=carr;
threaddata[nth].M=M;
threaddata[nth].Nf=Nchan;
threaddata[nth].freqs=freqs;
threaddata[nth].fdelta=fdelta;
threaddata[nth].tdelta=tdelta;
threaddata[nth].dec0=dec0;
threaddata[nth].dobeam=dobeam;
threaddata[nth].ph_ra0=ph_ra0;
threaddata[nth].ph_dec0=ph_dec0;
threaddata[nth].ph_freq0=ph_freq0;
threaddata[nth].longitude=longitude;
threaddata[nth].latitude=latitude;
threaddata[nth].time_utc=time_utc;
threaddata[nth].tilesz=tilesz;
threaddata[nth].Nelem=Nelem;
threaddata[nth].xx=xx;
threaddata[nth].yy=yy;
threaddata[nth].zz=zz;
/* parameters */
threaddata[nth].p=p;
threaddata[nth].Ns=Nthb;
threaddata[nth].soff=ci;
/* which clusters to ignore */
threaddata[nth].ignlist=ignorelist;
pthread_create(&th_array[nth],&attr,predict_threadfn,(void*)(&threaddata[nth]));
/* next source set */
ci=ci+Nthb;
}
if (add_to_data==SIMUL_ONLY) {
/* zero the data so the prediction replaces it */
memset(x,0,sizeof(double)*8*Nbase*tilesz*Nchan);
}
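/* zeroing x here, before joining, is safe: the worker threads write only to their own xlocal slices */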
/* now wait for threads to finish */
for(ci=0; ci<nth; ci++) {
pthread_join(th_array[ci],NULL);
if (add_to_data==SIMUL_ONLY || add_to_data==SIMUL_ADD) {
/* add to input value */
my_daxpy(Nbase*8*tilesz*Nchan,threaddata[ci].x,1.0,x);
} else {
/* subtract from input value */
my_daxpy(Nbase*8*tilesz*Nchan,threaddata[ci].x,-1.0,x);
}
}
free(xlocal);
free(threaddata);
/* find if any cluster is specified for correction of data */
int cm,cj;
cm=-1;
for (cj=0; cj<M; cj++) { /* clusters */
/* check if cluster id == ccid to do a correction */
if (carr[cj].id==ccid) {
/* valid cluster found */
cm=cj;
}
}
/* correction of data, if valid cluster is given */
if (cm>=0) {
double *pm,*pinv,*pphase=0;
/* allocate memory for inverse J */
if ((pinv=(double*)malloc((size_t)8*N*carr[cm].nchunk*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if (!phase_only) {
for (cj=0; cj<carr[cm].nchunk; cj++) {
pm=&(p[carr[cm].p[cj]]); /* start of solutions */
/* invert N solutions */
for (ci=0; ci<N; ci++) {
mat_invert(&pm[8*ci],&pinv[8*ci+8*N*cj], rho);
}
}
} else {
/* joint diagonalize solutions and get only phases before inverting */
if ((pphase=(double*)malloc((size_t)8*N*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
for (cj=0; cj<carr[cm].nchunk; cj++) {
pm=&(p[carr[cm].p[cj]]); /* start of solutions */
/* extract phase of pm, output to pphase */
extract_phases(pm,pphase,N,10);
/* invert N solutions */
for (ci=0; ci<N; ci++) {
mat_invert(&pphase[8*ci],&pinv[8*ci+8*N*cj], rho);
}
}
free(pphase);
}
/* divide x[] over GPUs */
int Nbase1=Nbase*tilesz;
Nthb0=(Nbase1+Ngpu-1)/Ngpu;
if ((threaddata_corr=(thread_data_corr_t*)malloc((size_t)Ngpu*sizeof(thread_data_corr_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
ci=0;
for (nth=0; nth<Ngpu && ci<Nbase1; nth++) {
/* this thread will handle baselines [ci:min(Nbase1-1,ci+Nthb0-1)] */
/* determine actual no. of baselines */
if (ci+Nthb0<Nbase1) {
Nthb=Nthb0;
} else {
Nthb=Nbase1-ci;
}
threaddata_corr[nth].hst=&thst;
threaddata_corr[nth].tid=nth;
threaddata_corr[nth].boff=ci;
threaddata_corr[nth].Nb=Nthb;
threaddata_corr[nth].barr=barr;
threaddata_corr[nth].x=x;
threaddata_corr[nth].N=N;
threaddata_corr[nth].Nbase=Nbase1;
threaddata_corr[nth].Nf=Nchan;
/* for correction of data */
threaddata_corr[nth].pinv=pinv;
threaddata_corr[nth].nchunk=carr[cm].nchunk;
pthread_create(&th_array[nth],&attr,correct_threadfn,(void*)(&threaddata_corr[nth]));
/* next baseline set */
ci=ci+Nthb;
}
/* now wait for threads to finish */
for(ci=0; ci<nth; ci++) {
pthread_join(th_array[ci],NULL);
}
free(pinv);
free(threaddata_corr);
}
free(th_array);
pthread_attr_destroy(&attr);
destroy_task_hist(&thst);
return 0;
}
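A minimal usage sketch of the new entry point under the three simulation modes; the variable names follow the prototype above, and the observation that any mode other than SIMUL_ONLY/SIMUL_ADD subtracts is taken from the join loop:

/* x initially holds the data, size Nbase*tilesz*8*Nchan */
/* replace x with the model predicted from solutions p: */
predict_visibilities_withsol_withbeam_gpu(u,v,w,p,x,ignorelist,N,Nbase,tilesz,
 barr,carr,M,freqs,Nchan,fdelta,tdelta,dec0,
 ph_ra0,ph_dec0,ph_freq0,longitude,latitude,time_utc,
 Nelem,xx,yy,zz,dobeam,Nt,SIMUL_ONLY,ccid,rho,phase_only);
/* pass SIMUL_ADD to add the model to x, or any other mode to subtract it */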