From 028ac53347d133e7b41af50701a0518ec448e719 Mon Sep 17 00:00:00 2001 From: Sarod Yatawatta Date: Mon, 20 Nov 2017 12:36:17 +0100 Subject: [PATCH] added model prediction and residual calculation using GPU --- src/MPI/main.cpp | 1 + src/MPI/sagecal_slave.cpp | 36 +- src/MS/main.cpp | 37 +- src/lib/clmfit.c | 1 + src/lib/predict_model.cu | 523 ++++++++++++++++++++--- src/lib/predict_withbeam_gpu.c | 729 ++++++++++++++++++++++++++++++++- src/lib/sagecal.h | 20 +- 7 files changed, 1254 insertions(+), 93 deletions(-) diff --git a/src/MPI/main.cpp b/src/MPI/main.cpp index bd4a21c..d911967 100644 --- a/src/MPI/main.cpp +++ b/src/MPI/main.cpp @@ -239,6 +239,7 @@ ParseCmdLine(int ac, char **av) { } } + /* real main program */ int main(int argc, char **argv) { diff --git a/src/MPI/sagecal_slave.cpp b/src/MPI/sagecal_slave.cpp index 261b03d..3d3c718 100644 --- a/src/MPI/sagecal_slave.cpp +++ b/src/MPI/sagecal_slave.cpp @@ -537,12 +537,27 @@ cout<eP,&sinph,&cosph); ut=a*(cosph*up-sinph*vp); vt=b*(sinph*up+cosph*vp); + /* if u,v is way off the scale (beta) of shapelet modes, the result is almost always zero, + so check this here and return 0, otherwise spurious nans may result */ + if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))beta) { + return make_cuDoubleComplex(0.0,0.0); + } /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v) so negate the u grid */ Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float)); @@ -345,45 +369,57 @@ shapelet_contrib__(int *dd, float u, float v, float w) { free(Av); free(cplx); - //return 2.0*M_PI*(realsum+_Complex_I*imagsum); realsum*=2.0f*M_PI*a*b; imagsum*=2.0f*M_PI*a*b; + /* additional safeguards */ + if ( isnan(realsum) ) { realsum=0.0f; } + if ( isnan(imagsum) ) { imagsum=0.0f; } return make_cuDoubleComplex((double)realsum,(double)imagsum); } - -__device__ cuDoubleComplex -compute_prodterm(int sta1, int sta2, int N, int K, int T, int F, -double phterm0, double sIf, double sI0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0, -const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w) { - +__device__ void +compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F, +double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0, + double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) { + /* F>1 is assumed output: 8x1 array */ double sinph,cosph; - double myfreq=__ldg(&freqs[cf]); sincos(phterm0*myfreq,&sinph,&cosph); cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph); - double If; - if (F==1) { - /* flux: do not use spectra here, because F=1*/ - If=sIf; - } else { - /* evaluate spectra */ - double fratio=log(myfreq/myf0); - double fratio1=fratio*fratio; - double fratio2=fratio1*fratio; - /* catch -ve flux */ - if (sI0f>0.0) { - If=exp(log(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); - } else if (sI0f<0.0) { - If=-exp(log(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); - } else { - If=0.0; - } + double If,Qf,Uf,Vf; + If=Qf=Uf=Vf=0.0; + /* evaluate spectra */ + double fratio=log(myfreq/myf0); + double fratio1=fratio*fratio; + double fratio2=fratio1*fratio; + double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2; + /* catch -ve flux */ + if 
(sI0f>0.0) {
+     If=exp(log(sI0f)+cm);
+   } else if (sI0f<0.0) {
+     If=-exp(log(-sI0f)+cm);
+   }
+   if (sQ0f>0.0) {
+     Qf=exp(log(sQ0f)+cm);
+   } else if (sQ0f<0.0) {
+     Qf=-exp(log(-sQ0f)+cm);
+   }
+   if (sU0f>0.0) {
+     Uf=exp(log(sU0f)+cm);
+   } else if (sU0f<0.0) {
+     Uf=-exp(log(-sU0f)+cm);
+   }
+   if (sV0f>0.0) {
+     Vf=exp(log(sV0f)+cm);
+   } else if (sV0f<0.0) {
+     Vf=-exp(log(-sV0f)+cm);
   }
 
-  /* smearing */
+
+  /* smearing, beam */
+  double scalef=1.0;
   double phterm =(phterm0*0.5*deltaf);
   if (phterm!=0.0) {
     sinph=(sin(phterm)/phterm);
-    If *=fabsf(sinph); /* catch -ve values due to rounding off */
+    scalef *=fabs(sinph); /* catch -ve values due to rounding off */
   }
 
   if (dobeam) {
@@ -396,13 +432,13 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
     //int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm;
     int boffset2=itm*N*K*F+k*N*F+cf*N+sta2;
     float beam2=__ldg(&beam[boffset2]);
-    If *=(double)(beam1*beam2);
+    scalef *=(double)(beam1*beam2);
   }
 
   /* form complex value */
-  prodterm.x *=If;
-  prodterm.y *=If;
+  prodterm.x *=scalef;
+  prodterm.y *=scalef;
 
   /* check for type of source */
   if (stypeT!=STYPE_POINT) {
@@ -420,10 +456,99 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int
     }
   }
 
-//printf("Input k=%d uvw %lE,%lE,%lE sI=%f phterm0=%lf\n",k,u,v,w,sIf,phterm0);
-//printf("phterm %lf smearing %f\n",phterm,sinph);
-//printf("k=%d uvw %lE,%lE,%lE If=%f phterm %lf out=%f,%f\n",k,u,v,w,If,phterm,prodterm.x,prodterm.y);
- return prodterm;
+ double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
+ Ix=If*prodterm.x;
+ Iy=If*prodterm.y;
+ Qx=Qf*prodterm.x;
+ Qy=Qf*prodterm.y;
+ Ux=Uf*prodterm.x;
+ Uy=Uf*prodterm.y;
+ Vx=Vf*prodterm.x;
+ Vy=Vf*prodterm.y;
+
+
+
+ output[0]=Ix+Qx;
+ output[1]=Iy+Qy;
+ output[2]=Ux-Vy;
+ output[3]=Vx+Uy;
+ output[4]=Ux+Vy;
+ output[5]=-Vx+Uy;
+ output[6]=Ix-Qx;
+ output[7]=Iy-Qy;
+
+}
+
+
+__device__ void
+compute_prodterm(int sta1, int sta2, int N, int K, int T,
+ double phterm0, double If, double Qf, double Uf, double Vf,
+ double myfreq, double deltaf, int dobeam, int itm, int k, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
+ /* F==1 is assumed, output: 8x1 array */
+ double sinph,cosph;
+ sincos(phterm0*myfreq,&sinph,&cosph);
+ cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
+
+ /* smearing, beam */
+ double scalef=1.0;
+ double phterm =(phterm0*0.5*deltaf);
+ if (phterm!=0.0) {
+  sinph=(sin(phterm)/phterm);
+  scalef *=fabs(sinph); /* catch -ve values due to rounding off */
+ }
+
+ if (dobeam) {
+  /* get beam info */
+  int boffset1=itm*N*K+k*N+sta1;
+  // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2);
+  float beam1=__ldg(&beam[boffset1]);
+  int boffset2=itm*N*K+k*N+sta2;
+  float beam2=__ldg(&beam[boffset2]);
+  scalef *=(double)(beam1*beam2);
+ }
+
+
+ /* form complex value */
+ prodterm.x *=scalef;
+ prodterm.y *=scalef;
+
+ /* check for type of source */
+ if (stypeT!=STYPE_POINT) {
+  double uscaled=u*myfreq;
+  double vscaled=v*myfreq;
+  double wscaled=w*myfreq;
+  if (stypeT==STYPE_SHAPELET) {
+   prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
+  } else if (stypeT==STYPE_GAUSSIAN) {
+   prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
+  } else if (stypeT==STYPE_DISK) {
+   prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
+  } else if (stypeT==STYPE_RING) { + 
prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + } + } + + + double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy; + Ix=If*prodterm.x; + Iy=If*prodterm.y; + Qx=Qf*prodterm.x; + Qy=Qf*prodterm.y; + Ux=Uf*prodterm.x; + Uy=Uf*prodterm.y; + Vx=Vf*prodterm.x; + Vy=Vf*prodterm.y; + + + + output[0]=Ix+Qx; + output[1]=Iy+Qy; + output[2]=Ux-Vy; + output[3]=Vx+Uy; + output[4]=Ux+Vy; + output[5]=-Vx+Uy; + output[6]=Ix-Qx; + output[7]=Iy-Qy; } @@ -431,8 +556,11 @@ const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int __global__ void kernel_coherencies(int B, int N, int T, int K, int F, const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w, - baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn, const double *__restrict__ sI, - const unsigned char *__restrict__ stype, const double *__restrict__ sI0, const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) { + baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn, + const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV, + const unsigned char *__restrict__ stype, const double *__restrict__ sI0, +const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0, + const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) { /* global thread index */ unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; @@ -451,46 +579,96 @@ kernel_coherencies(int B, int N, int T, int K, int F, //TODO: figure out if this max_f makes any sense #define MAX_F 20 - cuDoubleComplex l_coh[MAX_F]; + double l_coh[MAX_F][8]; for(int cf=0; cf1 + if (F==1) { + //use simply for-loop, if K is very large this may be slow and may need further parallelization + for (int k=0; k1) { - sI0f=__ldg(&sI0[k]); - spec_idxf=__ldg(&spec_idx[k]); - spec_idx1f=__ldg(&spec_idx1[k]); - spec_idx2f=__ldg(&spec_idx2[k]); - myf0=__ldg(&f0[k]); - } + sQf=__ldg(&sQ[k]); + sUf=__ldg(&sU[k]); + sVf=__ldg(&sV[k]); + + unsigned char stypeT=__ldg(&stype[k]); + + double llcoh[8]; + compute_prodterm(sta1, sta2, N, K, T, phterm0, sIf, sQf, sUf, sVf, + __ldg(&(freqs[0])), deltaf, dobeam, tslot, k, beam, exs, stypeT, u_n, v_n, w_n, llcoh); + + l_coh[0][0] +=llcoh[0]; + l_coh[0][1] +=llcoh[1]; + l_coh[0][2] +=llcoh[2]; + l_coh[0][3] +=llcoh[3]; + l_coh[0][4] +=llcoh[4]; + l_coh[0][5] +=llcoh[5]; + l_coh[0][6] +=llcoh[6]; + l_coh[0][7] +=llcoh[7]; + + } + } else { + //use simply for-loop, if K is very large this may be slow and may need further parallelization + for (int k=0; k1 + if (F==1) { + //use simply for-loop, if K is very large this may be slow and may need further parallelization + for (int k=0; k0 */ void -cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, - unsigned char *stype, double *sI0, double *f0, double *spec_idx, double *spec_idx1, double 
*spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) { +cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV, + unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) { #ifdef CUDA_DBG cudaError_t error; error = cudaGetLastError(); @@ -620,8 +988,8 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, /* note: make sure we do not exceed max no of blocks available, otherwise (too many baselines, loop over source id) */ int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock; - kernel_coherencies<<>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, - stype, sI0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); + kernel_coherencies<<>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV, + stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); cudaDeviceSynchronize(); #ifdef CUDA_DBG error = cudaGetLastError(); @@ -634,6 +1002,33 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, } +/* p : parameters 8Nxnchunk values */ +void +cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV, + unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) { +#ifdef CUDA_DBG + cudaError_t error; + error = cudaGetLastError(); +#endif + + /* spawn threads to handle baselines, these threads will spawn threads for sources */ + int ThreadsPerBlock=DEFAULT_TH_PER_BK; + /* note: make sure we do not exceed max no of blocks available, + otherwise (too many baselines, loop over source id) */ + int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock; + kernel_residuals<<>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV, + stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); + cudaDeviceSynchronize(); +#ifdef CUDA_DBG + error = cudaGetLastError(); + if(error != cudaSuccess) { + // print the CUDA error message and exit + fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); + exit(-1); + } +#endif +} + /* convert time JD to GMST angle store result at the same location */ void diff --git a/src/lib/predict_withbeam_gpu.c b/src/lib/predict_withbeam_gpu.c index 756afd3..bef23cb 100644 --- a/src/lib/predict_withbeam_gpu.c +++ b/src/lib/predict_withbeam_gpu.c @@ -92,7 +92,7 @@ dtofcopy(int N, float **x_d, double *x) { checkCudaError(err,__FILE__,__LINE__); /* copy memory */ - err=cudaMemcpyAsync(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice,0); + err=cudaMemcpy(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice); checkCudaError(err,__FILE__,__LINE__); /* free host buffer */ @@ -120,6 +120,15 @@ precalcoh_threadfn(void *data) { err=cudaSetDevice(card); checkCudaError(err,__FILE__,__LINE__); + /* make sure enough heap memory is available for shapelet computations 
*/ + size_t plim; + err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize); + checkCudaError(err,__FILE__,__LINE__); + if (plimNbase*8*sizeof(float)); /* coherencies only for 1 cluster, Nf=1 */ + err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(double)); /* coherencies only for 1 cluster, Nf=1 */ checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t)); checkCudaError(err,__FILE__,__LINE__); @@ -218,9 +227,16 @@ precalcoh_threadfn(void *data) { float *beamd; double *lld,*mmd,*nnd; double *sId; float *rad,*decd; + double *sQd,*sUd,*sVd; unsigned char *styped; double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d; + double *sQ0d,*sU0d,*sV0d; int **host_p,**dev_p; + + complex double *tempdcoh; + err=cudaMallocHost((void**)&tempdcoh,sizeof(complex double)*(size_t)t->Nbase*4); + checkCudaError(err,__FILE__,__LINE__); + /******************* begin loop over clusters **************************/ for (ncl=t->soff; nclsoff+t->Ns; ncl++) { @@ -249,11 +265,29 @@ precalcoh_threadfn(void *data) { err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); checkCudaError(err,__FILE__,__LINE__); + + if (t->Nf==1) { err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + } + + + if (t->dobeam) { dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra); dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec); @@ -264,7 +298,8 @@ precalcoh_threadfn(void *data) { err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice); checkCudaError(err,__FILE__,__LINE__); - /* for multi channel data */ + /* for multi channel data - FIXME: remove this part because Nf==1 always */ + if (t->Nf>1) { err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); @@ -286,6 +321,22 @@ precalcoh_threadfn(void *data) { err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); checkCudaError(err,__FILE__,__LINE__); + + err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double)); + 
checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + } + /* extra info for source, if any */ if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); @@ -347,15 +398,10 @@ precalcoh_threadfn(void *data) { /* calculate coherencies for all sources in this cluster, add them up */ cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd, - lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam); + lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam); /* copy back coherencies to host, coherencies on host have 8M stride, on device have 8 stride */ - complex double *tempdcoh; - if ((tempdcoh=(complex double*)calloc((size_t)t->Nbase*4,sizeof(complex double)))==0) { - fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } err=cudaMemcpy((double*)tempdcoh, cohd, sizeof(double)*t->Nbase*8, cudaMemcpyDeviceToHost); checkCudaError(err,__FILE__,__LINE__); /* now copy this with right offset and stride */ @@ -363,7 +409,6 @@ precalcoh_threadfn(void *data) { my_ccopy(t->Nbase,&tempdcoh[1],4,&(t->coh[4*ncl+1]),4*t->M); my_ccopy(t->Nbase,&tempdcoh[2],4,&(t->coh[4*ncl+2]),4*t->M); my_ccopy(t->Nbase,&tempdcoh[3],4,&(t->coh[4*ncl+3]),4*t->M); - free(tempdcoh); for (cj=0; cjcarr[ncl].N; cj++) { @@ -402,8 +447,16 @@ precalcoh_threadfn(void *data) { checkCudaError(err,__FILE__,__LINE__); err=cudaFree(nnd); checkCudaError(err,__FILE__,__LINE__); + if (t->Nf==1) { err=cudaFree(sId); checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sQd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sUd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sVd); + checkCudaError(err,__FILE__,__LINE__); + } if (t->dobeam) { err=cudaFree(rad); checkCudaError(err,__FILE__,__LINE__); @@ -413,6 +466,7 @@ precalcoh_threadfn(void *data) { err=cudaFree(styped); checkCudaError(err,__FILE__,__LINE__); + if (t->Nf>1) { err=cudaFree(sI0d); checkCudaError(err,__FILE__,__LINE__); err=cudaFree(f0d); @@ -423,10 +477,21 @@ precalcoh_threadfn(void *data) { checkCudaError(err,__FILE__,__LINE__); err=cudaFree(spec_idx2d); checkCudaError(err,__FILE__,__LINE__); + + err=cudaFree(sQ0d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sU0d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sV0d); + checkCudaError(err,__FILE__,__LINE__); + } } /******************* end loop over clusters **************************/ /* free memory */ + err=cudaFreeHost(tempdcoh); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(ud); checkCudaError(err,__FILE__,__LINE__); err=cudaFree(vd); @@ -545,7 +610,7 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d threaddata[nth].x=0; /* no input data */ threaddata[nth].N=N; - threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbasextilesz */ + threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbasextilesz (from input) */ threaddata[nth].barr=barr; threaddata[nth].carr=carr; threaddata[nth].M=M; @@ -666,7 +731,6 @@ predictvis_threadfn(void *data) { cudaError_t err; int ci,ncl,cj; -printf("Attath %d to card %d\n",t->tid,card); err=cudaSetDevice(card); checkCudaError(err,__FILE__,__LINE__); @@ -675,12 +739,13 @@ printf("Attath %d to card 
%d\n",t->tid,card); size_t plim; err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize); checkCudaError(err,__FILE__,__LINE__); - if (plim<16*1024*1024) { /* 16MB */ - err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, 16*1024*1024); + if (plimtid,card); float *beamd; double *lld,*mmd,*nnd; double *sId; float *rad,*decd; + double *sQd,*sUd,*sVd; unsigned char *styped; double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d; + double *sQ0d,*sU0d,*sV0d; int **host_p,**dev_p; double *xlocal; @@ -792,7 +859,6 @@ printf("Attath %d to card %d\n",t->tid,card); /******************* begin loop over clusters **************************/ for (ncl=t->soff; nclsoff+t->Ns; ncl++) { - printf("th %d clus %d:%d working %d\n",t->tid,t->soff,t->soff+t->Ns-1,ncl); /* allocate memory for this clusters beam */ if (t->dobeam) { err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float)); @@ -820,11 +886,27 @@ printf("Attath %d to card %d\n",t->tid,card); checkCudaError(err,__FILE__,__LINE__); + if (t->Nf==1) { err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + } + + if (t->dobeam) { dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra); dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec); @@ -836,6 +918,7 @@ printf("Attath %d to card %d\n",t->tid,card); checkCudaError(err,__FILE__,__LINE__); /* for multi channel data */ + if (t->Nf>1) { err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); @@ -858,6 +941,23 @@ printf("Attath %d to card %d\n",t->tid,card); checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + } + + + /* extra info for source, if any */ if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) { fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); @@ -920,7 +1020,7 @@ printf("Attath %d to card %d\n",t->tid,card); /* calculate coherencies 
for all sources in this cluster, add them up */ cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd, - lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam); + lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam); /* copy back coherencies to host, coherencies on host have 8M stride, on device have 8 stride */ @@ -964,8 +1064,16 @@ printf("Attath %d to card %d\n",t->tid,card); checkCudaError(err,__FILE__,__LINE__); err=cudaFree(nnd); checkCudaError(err,__FILE__,__LINE__); + if (t->Nf==1) { err=cudaFree(sId); checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sQd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sUd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sVd); + checkCudaError(err,__FILE__,__LINE__); + } if (t->dobeam) { err=cudaFree(rad); checkCudaError(err,__FILE__,__LINE__); @@ -975,6 +1083,7 @@ printf("Attath %d to card %d\n",t->tid,card); err=cudaFree(styped); checkCudaError(err,__FILE__,__LINE__); + if (t->Nf>1) { err=cudaFree(sI0d); checkCudaError(err,__FILE__,__LINE__); err=cudaFree(f0d); @@ -985,6 +1094,14 @@ printf("Attath %d to card %d\n",t->tid,card); checkCudaError(err,__FILE__,__LINE__); err=cudaFree(spec_idx2d); checkCudaError(err,__FILE__,__LINE__); + + err=cudaFree(sQ0d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sU0d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sV0d); + checkCudaError(err,__FILE__,__LINE__); + } } /******************* end loop over clusters **************************/ @@ -1037,6 +1154,7 @@ printf("Attath %d to card %d\n",t->tid,card); free(zz_p); } + /* reset error state */ err=cudaGetLastError(); return NULL; @@ -1089,7 +1207,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit - /* set common parameters, and split baselines to threads */ + /* set common parameters, and split clusters to threads */ ci=0; for (nth=0; nthM<=MAX_GPU_ID) { + card=select_work_gpu(MAX_GPU_ID,t->hst); + } else { + card=t->tid; + } + cudaError_t err; + int ci,ncl,cj; + + + err=cudaSetDevice(card); + checkCudaError(err,__FILE__,__LINE__); + + /* make sure enough heap memory is available for shapelet computations */ + size_t plim; + err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize); + checkCudaError(err,__FILE__,__LINE__); + if (plimNbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/ + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int)); + checkCudaError(err,__FILE__,__LINE__); + + /* copy to device */ + /* u,v,w and l,m,n coords need to be double for precision */ + err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + + 
err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + + /* check if beam is actually calculated */ + if (t->dobeam) { + dtofcopy(t->N,&longd,t->longitude); + dtofcopy(t->N,&latd,t->latitude); + err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + /* convert time jd to GMST angle */ + cudakernel_convert_time(t->tilesz,timed); + + err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + /* jagged arrays for element locations */ + err=cudaMalloc((void**)&xxd, t->N*sizeof(int*)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**)&yyd, t->N*sizeof(int*)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**)&zzd, t->N*sizeof(int*)); + checkCudaError(err,__FILE__,__LINE__); + /* allocate host memory to store pointers */ + if ((xx_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) { + fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + exit(1); + } + if ((yy_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) { + fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + exit(1); + } + if ((zz_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) { + fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + exit(1); + } + for (ci=0; ciN; ci++) { + err=cudaMalloc((void**)&xx_p[ci], t->Nelem[ci]*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**)&yy_p[ci], t->Nelem[ci]*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**)&zz_p[ci], t->Nelem[ci]*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + } + /* now copy data */ + for (ci=0; ciN; ci++) { + dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]); + dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]); + dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]); + } + /* now copy pointer locations to device */ + err=cudaMemcpy(xxd, xx_p, t->N*sizeof(int*), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(yyd, yy_p, t->N*sizeof(int*), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + } + + + float *beamd; + double *lld,*mmd,*nnd; + double *sId; float *rad,*decd; + double *sQd,*sUd,*sVd; + unsigned char *styped; + double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d; + double *sQ0d,*sU0d,*sV0d; + int **host_p,**dev_p; + + double *xlocal; + err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf); + checkCudaError(err,__FILE__,__LINE__); + + double *pd; /* parameter array per cluster */ + +/******************* begin loop over clusters **************************/ + for (ncl=t->soff; nclsoff+t->Ns; ncl++) { + /* allocate memory for this clusters beam */ + if (t->dobeam) { + err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float)); + checkCudaError(err,__FILE__,__LINE__); + } else { + beamd=0; + } + + + /* copy cluster details to GPU */ + err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char)); + 
checkCudaError(err,__FILE__,__LINE__); + + err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + /* parameter vector size may change depending on hybrid parameter */ + err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + + if (t->Nf==1) { + err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + } + + + if (t->dobeam) { + dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra); + dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec); + } else { + rad=0; + decd=0; + } + err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + /* for multi channel data */ + if (t->Nf>1) { + err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double)); + 
checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + + err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + } + + + + /* extra info for source, if any */ + if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) { + fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + exit(1); + } + err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*)); + checkCudaError(err,__FILE__,__LINE__); + + + for (cj=0; cjcarr[ncl].N; cj++) { + + if (t->carr[ncl].stype[cj]==STYPE_POINT) { + host_p[cj]=0; + } else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) { + exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj]; + err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet)); + checkCudaError(err,__FILE__,__LINE__); + double *modes; + err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj]; + err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + } else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) { + exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj]; + err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + } else if (t->carr[ncl].stype[cj]==STYPE_DISK) { + exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj]; + err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + } else if (t->carr[ncl].stype[cj]==STYPE_RING) { + exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj]; + err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring)); + checkCudaError(err,__FILE__,__LINE__); + err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + } + + + } + /* now copy pointer locations to device */ + err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice); + checkCudaError(err,__FILE__,__LINE__); + + + if (t->dobeam) { + /* now calculate beam for all sources in this cluster */ + cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd); + } + + + /* calculate 
coherencies for all sources in this cluster, add them up */ + cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd, + lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam); + + /* copy back coherencies to host, + coherencies on host have 8M stride, on device have 8 stride */ + err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost); + checkCudaError(err,__FILE__,__LINE__); + my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x); + + for (cj=0; cjcarr[ncl].N; cj++) { + if (t->carr[ncl].stype[cj]==STYPE_POINT) { + } else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) { + exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj]; + double *modes=0; + err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost); + err=cudaFree(modes); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(host_p[cj]); + checkCudaError(err,__FILE__,__LINE__); + } else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) { + err=cudaFree(host_p[cj]); + checkCudaError(err,__FILE__,__LINE__); + } else if (t->carr[ncl].stype[cj]==STYPE_DISK) { + err=cudaFree(host_p[cj]); + checkCudaError(err,__FILE__,__LINE__); + } else if (t->carr[ncl].stype[cj]==STYPE_RING) { + err=cudaFree(host_p[cj]); + checkCudaError(err,__FILE__,__LINE__); + } + } + free(host_p); + + err=cudaFree(dev_p); + checkCudaError(err,__FILE__,__LINE__); + + if (t->dobeam) { + err=cudaFree(beamd); + checkCudaError(err,__FILE__,__LINE__); + } + err=cudaFree(lld); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(mmd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(nnd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(pd); + checkCudaError(err,__FILE__,__LINE__); + if (t->Nf==1) { + err=cudaFree(sId); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sQd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sUd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sVd); + checkCudaError(err,__FILE__,__LINE__); + } + if (t->dobeam) { + err=cudaFree(rad); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(decd); + checkCudaError(err,__FILE__,__LINE__); + } + err=cudaFree(styped); + checkCudaError(err,__FILE__,__LINE__); + + if (t->Nf>1) { + err=cudaFree(sI0d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(f0d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(spec_idxd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(spec_idx1d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(spec_idx2d); + checkCudaError(err,__FILE__,__LINE__); + + err=cudaFree(sQ0d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sU0d); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(sV0d); + checkCudaError(err,__FILE__,__LINE__); + } + } +/******************* end loop over clusters **************************/ + + /* free memory */ + err=cudaFreeHost(xlocal); + checkCudaError(err,__FILE__,__LINE__); + + err=cudaFree(ud); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(vd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(wd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(cohd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(barrd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(freqsd); + checkCudaError(err,__FILE__,__LINE__); + + if (t->dobeam) { + err=cudaFree(longd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(latd); + checkCudaError(err,__FILE__,__LINE__); + 
err=cudaFree(timed); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(Nelemd); + checkCudaError(err,__FILE__,__LINE__); + + + for (ci=0; ciN; ci++) { + err=cudaFree(xx_p[ci]); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(yy_p[ci]); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(zz_p[ci]); + checkCudaError(err,__FILE__,__LINE__); + } + + err=cudaFree(xxd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(yyd); + checkCudaError(err,__FILE__,__LINE__); + err=cudaFree(zzd); + checkCudaError(err,__FILE__,__LINE__); + + free(xx_p); + free(yy_p); + free(zz_p); + } + + + /* reset error state */ + err=cudaGetLastError(); + return NULL; + +} + + +/* p: 8NMx1 parameter array, but M is 'effective' clusters, need to use carr to find the right offset + ccid: which cluster to use as correction + rho: MMSE robust parameter J+rho I inverted + + phase_only: if >0, and if there is any correction done, use only phase of diagonal elements for correction +*/ +int +calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0, +double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only) { + + int nth,ci; + + int Nthb0,Nthb; + pthread_attr_t attr; + pthread_t *th_array; + thread_data_pred_t *threaddata; + taskhist thst; + init_task_hist(&thst); + + /* oversubsribe GPU */ + int Ngpu=MAX_GPU_ID+1; + + /* calculate min clusters thread can handle */ + Nthb0=(M+Ngpu-1)/Ngpu; + + /* setup threads : note: Ngpu is no of GPUs used */ + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE); + + if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + + /* arrays to store result */ + double *xlocal; + if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) { + fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); + exit(1); + } + + + /* set common parameters, and split clusters to threads */ + ci=0; + for (nth=0; nth20 or so, try increasing this and recompiling + the default GPU values is ~ 8MB */ +#ifndef GPU_HEAP_SIZE +#define GPU_HEAP_SIZE 20 +#endif extern void @@ -2683,9 +2694,12 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud extern void -cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, - unsigned char *stype, double *sI0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam); +cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV, + unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, 
double dec0, double *coh, int dobeam); +extern void +cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV, + unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam); extern void cudakernel_convert_time(int T, double *time_utc);
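
Note on the new output[0..7] assembly in compute_prodterm() and compute_prodterm_multifreq(): the eight doubles are the real and imaginary parts of the four linear correlations XX, XY, YX, YY, obtained by scaling the Stokes fluxes I,Q,U,V with the common smeared, beam-weighted phase factor. Below is a minimal host-side sketch of the same arithmetic for reference; the helper name stokes_to_xy() and the standalone main() are illustrative only and are not part of this patch.

/* Stokes (I,Q,U,V) -> linear correlations (XX,XY,YX,YY), as in the kernels:
 *   B = [ I+Q   U+iV ]
 *       [ U-iV  I-Q  ]
 * multiplied by the complex factor (px + i*py), i.e. the scaled exp(i*phi). */
#include <stdio.h>

static void
stokes_to_xy(double I, double Q, double U, double V,
             double px, double py, /* Re,Im of the common phase/scale factor */
             double out[8]) {
  double Ix=I*px, Iy=I*py;
  double Qx=Q*px, Qy=Q*py;
  double Ux=U*px, Uy=U*py;
  double Vx=V*px, Vy=V*py;
  out[0]=Ix+Qx;  out[1]=Iy+Qy;   /* Re,Im of XX = (I+Q)(px+i py) */
  out[2]=Ux-Vy;  out[3]=Vx+Uy;   /* Re,Im of XY = (U+iV)(px+i py) */
  out[4]=Ux+Vy;  out[5]=-Vx+Uy;  /* Re,Im of YX = (U-iV)(px+i py) */
  out[6]=Ix-Qx;  out[7]=Iy-Qy;   /* Re,Im of YY = (I-Q)(px+i py) */
}

int main(void) {
  double out[8];
  /* unpolarized 1 Jy source with unit phase factor: only XX and YY are non-zero */
  stokes_to_xy(1.0,0.0,0.0,0.0, 1.0,0.0, out);
  printf("XX=%g%+gi XY=%g%+gi YX=%g%+gi YY=%g%+gi\n",
         out[0],out[1],out[2],out[3],out[4],out[5],out[6],out[7]);
  return 0;
}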
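
The GPU_HEAP_SIZE macro added to sagecal.h (units of MB, default 20) is used by the prediction/residual thread functions to grow the device malloc heap before launching kernels whose shapelet routines allocate mode arrays with in-kernel malloc(). A minimal sketch of that pattern, assuming it mirrors the pre-existing 16 MB check it replaces; the helper name ensure_device_heap() is illustrative only.

#include <stdio.h>
#include <cuda_runtime.h>

#ifndef GPU_HEAP_SIZE
#define GPU_HEAP_SIZE 20 /* MB */
#endif

/* grow (never shrink) the per-device heap used by in-kernel malloc()/free() */
static void
ensure_device_heap(void) {
  size_t plim=0;
  cudaError_t err=cudaDeviceGetLimit(&plim, cudaLimitMallocHeapSize);
  if (err!=cudaSuccess) {
    fprintf(stderr,"CUDA error: %s\n",cudaGetErrorString(err));
    return;
  }
  if (plim < (size_t)GPU_HEAP_SIZE*1024*1024) {
    err=cudaDeviceSetLimit(cudaLimitMallocHeapSize, (size_t)GPU_HEAP_SIZE*1024*1024);
    if (err!=cudaSuccess) {
      fprintf(stderr,"CUDA error: %s\n",cudaGetErrorString(err));
    }
  }
}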