diff --git a/src/MS/main.cpp b/src/MS/main.cpp
index 473ab29..b35a3a0 100644
--- a/src/MS/main.cpp
+++ b/src/MS/main.cpp
@@ -740,9 +740,15 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea
      }
 #endif
     } else {
+     /* if solution file is given, read in the solutions and predict */
      read_solutions(sfp,p,carr,iodata.N,M);
-     /* if solution file is given, read in the solutions and predict */
-     predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::DoSim,Data::ccid,Data::rho,Data::phaseOnly);
+
+     if (GPUpredict) {
+      predict_visibilities_withsol_withbeam_gpu(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,
+beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,Data::DoSim,Data::ccid,Data::rho,Data::phaseOnly);
+     } else {
+      predict_visibilities_multifreq_withsol(iodata.u,iodata.v,iodata.w,p,iodata.xo,ignorelist,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0,Data::Nt,Data::DoSim,Data::ccid,Data::rho,Data::phaseOnly);
+     }
     }
 /************ end simulation only mode ***************************/
    }
diff --git a/src/lib/Radio/Radio.h b/src/lib/Radio/Radio.h
index eaaf6d8..7822647 100644
--- a/src/lib/Radio/Radio.h
+++ b/src/lib/Radio/Radio.h
@@ -394,6 +394,9 @@ extern int
 calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
 double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc,int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only);
 
+extern int
+predict_visibilities_withsol_withbeam_gpu(double *u,double *v,double *w,double *p,double *x, int *ignorelist, int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
+double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data, int ccid, double rho, int phase_only);
 #endif /*!HAVE_CUDA */
 
 /****************************** predict_model.cu ****************************/
diff --git a/src/lib/Radio/predict_withbeam_gpu.c b/src/lib/Radio/predict_withbeam_gpu.c
index 365f0f2..dd846e1 100644
--- a/src/lib/Radio/predict_withbeam_gpu.c
+++ b/src/lib/Radio/predict_withbeam_gpu.c
@@ -106,6 +106,9 @@ typedef struct thread_data_pred_t_ {
  /* following are only used while predict with gain */
  double *p; /* parameter array, size could be 8*N*Mx1 (full) or 8*Nx1 (single)*/
 
+ /* following only used to ignore some clusters while predicting */
+ int *ignlist;
+
 } thread_data_pred_t;
 
 /* struct for passing data to threads for correcting data */
@@ -2051,3 +2054,678 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
 
  return 0;
 }
+
+
+
+static void *
+predict_threadfn(void *data) {
+ thread_data_pred_t *t=(thread_data_pred_t*)data;
+ /* first, select a GPU, if total clusters < MAX_GPU_ID
+    use random selection, else use this thread id */
+ int card;
+ if (t->M<=MAX_GPU_ID) {
+  card=select_work_gpu(MAX_GPU_ID,t->hst);
+ } else {
+  card=t->tid;
+ }
+ cudaError_t err;
+ int ci,ncl,cj;
+
+
+ err=cudaSetDevice(card);
+ checkCudaError(err,__FILE__,__LINE__);
+
+ double *ud,*vd,*wd;
+ double *cohd;
+ baseline_t *barrd;
+ double *freqsd;
+ float *longd=0,*latd=0; double *timed;
+ int *Nelemd;
+ float **xx_p=0,**yy_p=0,**zz_p=0;
+ float **xxd,**yyd,**zzd;
+ /* allocate memory in GPU */
+ err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
+ checkCudaError(err,__FILE__,__LINE__);
+
+ /* copy to device */
+ /* u,v,w and l,m,n coords need to be double for precision */
+ err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+
+
+ err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
+ checkCudaError(err,__FILE__,__LINE__);
+
+
+ /* check if beam is actually calculated */
+ if (t->dobeam) {
+  dtofcopy(t->N,&longd,t->longitude);
+  dtofcopy(t->N,&latd,t->latitude);
+  err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
+  checkCudaError(err,__FILE__,__LINE__);
+  /* convert time jd to GMST angle */
+  cudakernel_convert_time(t->tilesz,timed);
+
+  err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
+  checkCudaError(err,__FILE__,__LINE__);
+
+  /* jagged arrays for element locations */
+  err=cudaMalloc((void**)&xxd, t->N*sizeof(int*));
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaMalloc((void**)&yyd, t->N*sizeof(int*));
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaMalloc((void**)&zzd, t->N*sizeof(int*));
+  checkCudaError(err,__FILE__,__LINE__);
+  /* allocate host memory to store pointers */
+  if ((xx_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
+   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+   exit(1);
+  }
+  if ((yy_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
+   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+   exit(1);
+  }
+  if ((zz_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
+   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+   exit(1);
+  }
+  for (ci=0; ci<t->N; ci++) {
+   err=cudaMalloc((void**)&xx_p[ci], t->Nelem[ci]*sizeof(double));
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaMalloc((void**)&yy_p[ci], t->Nelem[ci]*sizeof(double));
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaMalloc((void**)&zz_p[ci], t->Nelem[ci]*sizeof(double));
+   checkCudaError(err,__FILE__,__LINE__);
+  }
+  /* now copy data */
+  for (ci=0; ci<t->N; ci++) {
+   dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
+   dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
+   dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
+  }
+  /* now copy pointer locations to device */
+  err=cudaMemcpy(xxd, xx_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaMemcpy(yyd, yy_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
+  checkCudaError(err,__FILE__,__LINE__);
+ }
+
+
+ float *beamd;
+ double *lld,*mmd,*nnd;
+ double *sId; float *rad,*decd;
+ double *sQd,*sUd,*sVd;
+ unsigned char *styped;
+ double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
+ double *sQ0d,*sU0d,*sV0d;
+ int **host_p,**dev_p;
+
+ double *xlocal;
+ err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
+ checkCudaError(err,__FILE__,__LINE__);
+
+ double *pd; /* parameter array per cluster */
+
+/******************* begin loop over clusters **************************/
+ for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
+  /* check if cluster id is not part of the ignore list */
+  if (!t->ignlist[ncl]) {
+   /* allocate memory for this cluster's beam */
+   if (t->dobeam) {
+    err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
+    checkCudaError(err,__FILE__,__LINE__);
+   } else {
+    beamd=0;
+   }
+
+
+   /* copy cluster details to GPU */
+   err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
+   checkCudaError(err,__FILE__,__LINE__);
+
+   err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+   checkCudaError(err,__FILE__,__LINE__);
+
+   /* parameter vector size may change depending on hybrid parameter */
+   err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
+   checkCudaError(err,__FILE__,__LINE__);
+
+
+   if (t->Nf==1) {
+    err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+
+    err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+   }
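+   /* note: with a single channel (Nf==1) the per-source fluxes sI..sV are
+      copied and used directly; for multi-channel data the reference fluxes
+      sI0..sV0, reference frequency f0 and the spectral-index terms are
+      copied instead (see the Nf>1 branch below) */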
+
+
+   if (t->dobeam) {
+    dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
+    dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
+   } else {
+    rad=0;
+    decd=0;
+   }
+   err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
+   checkCudaError(err,__FILE__,__LINE__);
+
+   /* for multi channel data */
+   if (t->Nf>1) {
+    err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+
+
+    err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
+    checkCudaError(err,__FILE__,__LINE__);
+
+   }
+
+
+
+   /* extra info for source, if any */
+   if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
+    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+    exit(1);
+   }
+   err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
+   checkCudaError(err,__FILE__,__LINE__);
+
+
+   for (cj=0; cj<t->carr[ncl].N; cj++) {
+
+    if (t->carr[ncl].stype[cj]==STYPE_POINT) {
+     host_p[cj]=0;
+    } else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
+     exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
+     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
+     checkCudaError(err,__FILE__,__LINE__);
+     double *modes;
+     err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
+     checkCudaError(err,__FILE__,__LINE__);
+     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
+     checkCudaError(err,__FILE__,__LINE__);
+     err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
+     checkCudaError(err,__FILE__,__LINE__);
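+     /* host_p[cj] now holds a device copy of the shapelet struct, but its
+        modes member still points at host memory; patch it to point at the
+        device copy of the mode coefficients */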
+     exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
+     err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
+     checkCudaError(err,__FILE__,__LINE__);
+    } else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
+     exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
+     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
+     checkCudaError(err,__FILE__,__LINE__);
+     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
+     checkCudaError(err,__FILE__,__LINE__);
+    } else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
+     exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
+     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
+     checkCudaError(err,__FILE__,__LINE__);
+     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
+     checkCudaError(err,__FILE__,__LINE__);
+    } else if (t->carr[ncl].stype[cj]==STYPE_RING) {
+     exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
+     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
+     checkCudaError(err,__FILE__,__LINE__);
+     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
+     checkCudaError(err,__FILE__,__LINE__);
+    }
+
+
+   }
+   /* now copy pointer locations to device */
+   err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
+   checkCudaError(err,__FILE__,__LINE__);
+
+
+   if (t->dobeam) {
+    /* now calculate beam for all sources in this cluster */
+    cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
+   }
+
+
+   /* calculate coherencies for all sources in this cluster, add them up */
+   cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
+    lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
+
+   /* copy back coherencies to host,
+      coherencies on host have 8M stride, on device have 8 stride */
+   err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
+   checkCudaError(err,__FILE__,__LINE__);
+   my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
+
+   for (cj=0; cj<t->carr[ncl].N; cj++) {
+    if (t->carr[ncl].stype[cj]==STYPE_POINT) {
+    } else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
+     exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
+     double *modes=0;
+     err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
+     err=cudaFree(modes);
+     checkCudaError(err,__FILE__,__LINE__);
+     err=cudaFree(host_p[cj]);
+     checkCudaError(err,__FILE__,__LINE__);
+    } else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
+     err=cudaFree(host_p[cj]);
+     checkCudaError(err,__FILE__,__LINE__);
+    } else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
+     err=cudaFree(host_p[cj]);
+     checkCudaError(err,__FILE__,__LINE__);
+    } else if (t->carr[ncl].stype[cj]==STYPE_RING) {
+     err=cudaFree(host_p[cj]);
+     checkCudaError(err,__FILE__,__LINE__);
+    }
+   }
+   free(host_p);
+
+   err=cudaFree(dev_p);
+   checkCudaError(err,__FILE__,__LINE__);
+
+   if (t->dobeam) {
+    err=cudaFree(beamd);
+    checkCudaError(err,__FILE__,__LINE__);
+   }
+   err=cudaFree(lld);
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaFree(mmd);
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaFree(nnd);
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaFree(pd);
+   checkCudaError(err,__FILE__,__LINE__);
+   if (t->Nf==1) {
+    err=cudaFree(sId);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(sQd);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(sUd);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(sVd);
+    checkCudaError(err,__FILE__,__LINE__);
+   }
+   if (t->dobeam) {
+    err=cudaFree(rad);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(decd);
+    checkCudaError(err,__FILE__,__LINE__);
+   }
+   err=cudaFree(styped);
+   checkCudaError(err,__FILE__,__LINE__);
+
+   if (t->Nf>1) {
+    err=cudaFree(sI0d);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(f0d);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(spec_idxd);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(spec_idx1d);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(spec_idx2d);
+    checkCudaError(err,__FILE__,__LINE__);
+
+    err=cudaFree(sQ0d);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(sU0d);
+    checkCudaError(err,__FILE__,__LINE__);
+    err=cudaFree(sV0d);
+    checkCudaError(err,__FILE__,__LINE__);
+   }
+  }
+ }
+/******************* end loop over clusters **************************/
+
+ /* free memory */
+ err=cudaFreeHost(xlocal);
+ checkCudaError(err,__FILE__,__LINE__);
+
+ err=cudaFree(ud);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaFree(vd);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaFree(wd);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaFree(cohd);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaFree(barrd);
+ checkCudaError(err,__FILE__,__LINE__);
+ err=cudaFree(freqsd);
+ checkCudaError(err,__FILE__,__LINE__);
+
+ if (t->dobeam) {
+  err=cudaFree(longd);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaFree(latd);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaFree(timed);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaFree(Nelemd);
+  checkCudaError(err,__FILE__,__LINE__);
+
+
+  for (ci=0; ci<t->N; ci++) {
+   err=cudaFree(xx_p[ci]);
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaFree(yy_p[ci]);
+   checkCudaError(err,__FILE__,__LINE__);
+   err=cudaFree(zz_p[ci]);
+   checkCudaError(err,__FILE__,__LINE__);
+  }
+
+  err=cudaFree(xxd);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaFree(yyd);
+  checkCudaError(err,__FILE__,__LINE__);
+  err=cudaFree(zzd);
+  checkCudaError(err,__FILE__,__LINE__);
+
+  free(xx_p);
+  free(yy_p);
+  free(zz_p);
+ }
+
+
+ cudaDeviceSynchronize();
+ /* reset error state */
+ err=cudaGetLastError();
+ return NULL;
+
+}
+
+/*
+  almost the same as calculate_residuals_multifreq_withbeam_gpu(), but ignorelist is used to skip prediction of some clusters, and predict/add/subtract is chosen based on the simulation mode add_to_data
+
+  int *ignorelist: Mx1 array
+  int add_to_data: predict/add/subtract mode
+*/
+int
+predict_visibilities_withsol_withbeam_gpu(double *u,double *v,double *w,double *p,double *x, int *ignorelist, int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
+double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int add_to_data, int ccid, double rho, int phase_only) {
+
+ int nth,ci;
+
+ int Nthb0,Nthb;
+ pthread_attr_t attr;
+ pthread_t *th_array;
+ thread_data_pred_t *threaddata;
+ thread_data_corr_t *threaddata_corr;
+ taskhist thst;
+ init_task_hist(&thst);
+
+ /* oversubscribe GPU */
+ int Ngpu=MAX_GPU_ID+1;
+
+ /* calculate min clusters thread can handle */
+ Nthb0=(M+Ngpu-1)/Ngpu;
+
+ /* setup threads : note: Ngpu is no of GPUs used */
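+ /* illustrative example of the cluster-to-thread split done below: M=10
+    clusters and Ngpu=4 give Nthb0=(10+4-1)/4=3, so each thread is assigned
+    up to 3 consecutive clusters (soff..soff+Ns-1 in predict_threadfn);
+    clusters flagged in ignorelist are skipped inside each thread */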
+ pthread_attr_init(&attr);
+ pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
+
+ if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
+  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+ if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
+  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+
+ /* arrays to store result */
+ double *xlocal;
+ if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
+  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+
+
+ /* set common parameters, and split clusters to threads */
+ ci=0;
+ for (nth=0; nth=0) {
+  double *pm,*pinv,*pphase=0;
+  /* allocate memory for inverse J */
+  if ((pinv=(double*)malloc((size_t)8*N*carr[cm].nchunk*sizeof(double)))==0) {
+   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
+   exit(1);
+  }
+  if (!phase_only) {
+   for (cj=0; cj