diff --git a/src/lib/predict_model.cu b/src/lib/predict_model.cu index ae9d329..30f5c21 100644 --- a/src/lib/predict_model.cu +++ b/src/lib/predict_model.cu @@ -51,121 +51,12 @@ radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float th *az=azd; } -/* slave kernel to calculate phase of manifold vector for given station */ -/* x,y,z: Nx1 arrays of element coords */ -/* sum: scalar to store result */ -/* NOTE: only 1 block should be used here */ -__global__ void -kernel_array_beam_slave_sin(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) { - unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; - extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */ - if (n 1) { - int halfPoint = (nTotalThreads >> 1); // divide by two - if (n < halfPoint) { - int thread2 = n + halfPoint; - if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 ) - tmpsum[n] = tmpsum[n]+tmpsum[thread2]; - } - } - __syncthreads(); - nTotalThreads = halfPoint; // Reducing the binary tree size by two - } - - /* now thread 0 will add up results */ - if (threadIdx.x==0) { - *sum=tmpsum[0]; - } -} - -__global__ void -kernel_array_beam_slave_cos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) { - unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; - extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */ - if (n 1) { - int halfPoint = (nTotalThreads >> 1); // divide by two - if (n < halfPoint) { - int thread2 = n + halfPoint; - if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 ) - tmpsum[n] = tmpsum[n]+tmpsum[thread2]; - } - } - __syncthreads(); - nTotalThreads = halfPoint; // Reducing the binary tree size by two - } - - /* now thread 0 will add up results */ - if (threadIdx.x==0) { - *sum=tmpsum[0]; - } -} - -/* sum: 2x1 array */ -__global__ void -kernel_array_beam_slave_sincos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) { - unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; - extern __shared__ float tmpsum[]; /* assumed to be size 2*Nx1 */ - if (n 1) { - int halfPoint = (nTotalThreads >> 1); // divide by two - if (n < halfPoint) { - int thread2 = n + halfPoint; - if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 ) - tmpsum[2*n] = tmpsum[2*n]+tmpsum[2*thread2]; - tmpsum[2*n+1] = tmpsum[2*n+1]+tmpsum[2*thread2+1]; - } - } - __syncthreads(); - nTotalThreads = halfPoint; // Reducing the binary tree size by two - } - - /* now thread 0 will add up results */ - if (threadIdx.x==0) { - sum[0]=tmpsum[0]; - sum[1]=tmpsum[1]; - } -} - - -__device__ int -NearestPowerOf2 (int n){ - if (!n) return n; //(0 == 2^0) - - int x = 1; - while(x < n) { - x <<= 1; - } - return x; -} - - /* use compiler directives to use/not use shared memory */ /* ARRAY_MAX_ELEM : define this in header file beforehand, if using shared memory */ /* master kernel to calculate beam */ __global__ void kernel_array_beam(int N, int T, int K, int F, - const float *__restrict__ freqs, const float *__restrict__ longitude, const float *__restrict__ latitude, + const double *__restrict__ freqs, const float *__restrict__ longitude, const float *__restrict__ latitude, const double *__restrict__ time_utc, const int *__restrict__ Nelem, const float * const *__restrict__ xx, const float * const *__restrict__ yy, const float * const *__restrict__ zz, const float *__restrict__ ra, const float *__restrict__ dec, @@ -230,7 +121,7 @@ kernel_array_beam(int N, int T, int K, int F, r2=(float)-tpc*(ph_freq0*sint0*sinph0-freqs[ifrq]*sint*sinph); r3=(float)-tpc*(ph_freq0*cost0-freqs[ifrq]*cost); */ - float f=__ldg(&freqs[ifrq]); + float f=(float)__ldg(&freqs[ifrq]); float rat1=ph_freq0*sint0; float rat2=f*sint; r1=-tpc*(rat1*cosph0-rat2*cosph); @@ -265,8 +156,8 @@ kernel_array_beam(int N, int T, int K, int F, } /***************************************************************************/ -__device__ cuFloatComplex -gaussian_contrib(int *dd, float u, float v, float w) { +__device__ cuDoubleComplex +gaussian_contrib__(int *dd, float u, float v, float w) { exinfo_gaussian *dp=(exinfo_gaussian*)dd; float up,vp,a,b,ut,vt,cosph,sinph; @@ -286,13 +177,13 @@ gaussian_contrib(int *dd, float u, float v, float w) { ut=a*(cosph*up-sinph*vp); vt=b*(sinph*up+cosph*vp); - return make_cuFloatComplex(0.5f*M_PI*expf(-(ut*ut+vt*vt)),0.0f); + return make_cuDoubleComplex((double)(0.5f*M_PI*expf(-(ut*ut+vt*vt))),0.0); } -__device__ cuFloatComplex -ring_contrib(int *dd, float u, float v, float w) { +__device__ cuDoubleComplex +ring_contrib__(int *dd, float u, float v, float w) { exinfo_ring *dp=(exinfo_ring*)dd; float up,vp,a,b; @@ -303,11 +194,11 @@ ring_contrib(int *dd, float u, float v, float w) { a=dp->eX; /* diameter */ b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI; - return make_cuFloatComplex(j0f(b),0.0f); + return make_cuDoubleComplex((double)j0f(b),0.0); } -__device__ cuFloatComplex -disk_contrib(int *dd, float u, float v, float w) { +__device__ cuDoubleComplex +disk_contrib__(int *dd, float u, float v, float w) { exinfo_disk *dp=(exinfo_disk*)dd; float up,vp,a,b; @@ -318,7 +209,7 @@ disk_contrib(int *dd, float u, float v, float w) { a=dp->eX; /* diameter */ b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI; - return make_cuFloatComplex(j1f(b),0.0f); + return make_cuDoubleComplex((double)j1f(b),0.0); } @@ -411,8 +302,8 @@ calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av free(shpvl); } -__device__ cuFloatComplex -shapelet_contrib(int *dd, float u, float v, float w) { +__device__ cuDoubleComplex +shapelet_contrib__(int *dd, float u, float v, float w) { exinfo_shapelet *dp=(exinfo_shapelet*)dd; int *cplx; float *Av; @@ -457,160 +348,91 @@ shapelet_contrib(int *dd, float u, float v, float w) { //return 2.0*M_PI*(realsum+_Complex_I*imagsum); realsum*=2.0f*M_PI*a*b; imagsum*=2.0f*M_PI*a*b; - return make_cuFloatComplex(realsum,imagsum); + return make_cuDoubleComplex((double)realsum,(double)imagsum); } +__device__ cuDoubleComplex +compute_prodterm(int sta1, int sta2, int N, int K, int T, int F, +double phterm0, double sIf, double sI0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0, +const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w) { -/* slave thread to calculate coherencies, for 1 source */ -/* baseline (sta1,sta2) at time itm */ -/* K: total sources, uset to find right offset - Kused: actual sources calculated in this thread block - Koff: offset in source array to start calculation - NOTE: only 1 block is used - */ -__global__ void -kernel_coherencies_slave(int sta1, int sta2, int itm, int B, int N, int T, int K, int Kused, int Koff, int F, float u, float v, float w, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI, - unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *__restrict__ coh,int dobeam,int blockDim_2) { - /* which source we work on */ - unsigned int k=threadIdx.x+blockDim.x*blockIdx.x; - - extern __shared__ float tmpcoh[]; /* assumed to be size 8*F*Kusedx1 */ - - if (k1) { - sI0f=__ldg(&sI0[k]); - spec_idxf=__ldg(&spec_idx[k]); - spec_idx1f=__ldg(&spec_idx1[k]); - spec_idx2f=__ldg(&spec_idx2[k]); - myf0=__ldg(&f0[k]); - } - unsigned char stypeT=__ldg(&stype[k]); - for(int cf=0; cf0.0f) { - If=__expf(__logf(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); - } else if (sI0f<0.0f) { - If=-__expf(__logf(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); + if (sI0f>0.0) { + If=exp(log(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); + } else if (sI0f<0.0) { + If=-exp(log(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); } else { - If=0.0f; + If=0.0; } } /* smearing */ - float phterm =phterm0*0.5f*deltaf; - if (phterm!=0.0f) { - sinph=__sinf(phterm)/phterm; + double phterm =(phterm0*0.5*deltaf); + if (phterm!=0.0) { + sinph=(sin(phterm)/phterm); If *=fabsf(sinph); /* catch -ve values due to rounding off */ } if (dobeam) { /* get beam info */ //int boffset1=sta1*K*T*F + k1*T*F + cf*T + itm; - int boffset1=itm*N*K*F+k1*N*F+cf*N+sta1; + + int boffset1=itm*N*K*F+k*N*F+cf*N+sta1; + // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2); float beam1=__ldg(&beam[boffset1]); //int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm; - int boffset2=itm*N*K*F+k1*N*F+cf*N+sta2; + int boffset2=itm*N*K*F+k*N*F+cf*N+sta2; float beam2=__ldg(&beam[boffset2]); - If *=beam1*beam2; + If *=(double)(beam1*beam2); } + /* form complex value */ prodterm.x *=If; prodterm.y *=If; /* check for type of source */ if (stypeT!=STYPE_POINT) { - float uscaled=u*myfreq; - float vscaled=v*myfreq; - float wscaled=w*myfreq; + double uscaled=u*myfreq; + double vscaled=v*myfreq; + double wscaled=w*myfreq; if (stypeT==STYPE_SHAPELET) { - prodterm=cuCmulf(shapelet_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); } else if (stypeT==STYPE_GAUSSIAN) { - prodterm=cuCmulf(gaussian_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); } else if (stypeT==STYPE_DISK) { - prodterm=cuCmulf(disk_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); } else if (stypeT==STYPE_RING) { - prodterm=cuCmulf(ring_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); } } - -//printf("k=%d cf=%d freq=%f uvw %f,%f,%f lmn %f,%f,%f phterm %f If %f\n",k,cf,freqs[cf],u,v,w,ll[k],mm[k],nn[k],phterm,If); - /* write output to shared array */ - tmpcoh[k*8*F+8*cf]=prodterm.x; - tmpcoh[k*8*F+8*cf+1]=prodterm.y; - tmpcoh[k*8*F+8*cf+2]=0.0f; - tmpcoh[k*8*F+8*cf+3]=0.0f; - tmpcoh[k*8*F+8*cf+4]=0.0f; - tmpcoh[k*8*F+8*cf+5]=0.0f; - tmpcoh[k*8*F+8*cf+6]=prodterm.x; - tmpcoh[k*8*F+8*cf+7]=prodterm.y; - } - } - __syncthreads(); +//printf("Input k=%d uvw %lE,%lE,%lE sI=%f phterm0=%lf\n",k,u,v,w,sIf,phterm0); +//printf("phterm %lf smearing %f\n",phterm,sinph); +//printf("k=%d uvw %lE,%lE,%lE If=%f phterm %lf out=%f,%f\n",k,u,v,w,If,phterm,prodterm.x,prodterm.y); + return prodterm; - // Build summation tree over elements, handling case where total threads is not a power of two. - int nTotalThreads = blockDim_2; // Total number of threads (==Kused), rounded up to the next power of two - while(nTotalThreads > 1) { - int halfPoint = (nTotalThreads >> 1); // divide by two - if (k < halfPoint) { - int thread2 = k + halfPoint; - if (thread2 < blockDim.x) { // Skipping the fictitious threads >Kused ( blockDim.x ... blockDim_2-1 ) - for(int cf=0; cf>>(sta1,sta2,tslot,B,N,T,K,K,0,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,ll,mm,nn,sI,stype,sI0,f0,spec_idx,spec_idx1,spec_idx2,exs,deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(K)); - cudaDeviceSynchronize(); -#ifdef CUDA_DBG - error = cudaGetLastError(); - if(error != cudaSuccess) { - // print the CUDA error message and exit - printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); - } -#endif - } else { - /* more than 1 kernel */ - int L=(K+ThreadsPerBlock-1)/ThreadsPerBlock; - int ct=0; - int myT; - for (int ci=0; ci>>(sta1,sta2,tslot,B,N,T,K,myT,ct,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,&ll[ct],&mm[ct],&nn[ct],&sI[ct],&stype[ct],&sI0[ct],&f0[ct],&spec_idx[ct],&spec_idx1[ct],&spec_idx2[ct],&exs[ct],deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(myT)); - cudaDeviceSynchronize(); -#ifdef CUDA_DBG - error = cudaGetLastError(); - if(error != cudaSuccess) { - // print the CUDA error message and exit - printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); - } -#endif - ct=ct+ThreadsPerBlock; - } + //TODO: figure out if this max_f makes any sense + #define MAX_F 20 + cuDoubleComplex l_coh[MAX_F]; + for(int cf=0; cf1) { + sI0f=__ldg(&sI0[k]); + spec_idxf=__ldg(&spec_idx[k]); + spec_idx1f=__ldg(&spec_idx1[k]); + spec_idx2f=__ldg(&spec_idx2[k]); + myf0=__ldg(&f0[k]); + } + + unsigned char stypeT=__ldg(&stype[k]); + + for(int cf=0; cf0 */ void -cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, float *w,baseline_t *barr, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI, - unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *coh,int dobeam) { +cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, + unsigned char *stype, double *sI0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) { #ifdef CUDA_DBG cudaError_t error; error = cudaGetLastError(); - error=cudaMemset(coh,0,sizeof(float)*8*B*F); - checkCudaError(error,__FILE__,__LINE__); -#endif -#ifndef CUDA_DBG - cudaMemset(coh,0,sizeof(float)*8*B*F); #endif /* spawn threads to handle baselines, these threads will spawn threads for sources */ @@ -826,7 +649,7 @@ cudakernel_convert_time(int T, double *time_utc) { int BlocksPerGrid=(T+ThreadsPerBlock-1)/ThreadsPerBlock; kernel_convert_time<<>>(T,time_utc); cudaDeviceSynchronize(); - #ifdef CUDA_DBG +#ifdef CUDA_DBG error = cudaGetLastError(); if(error != cudaSuccess) { // print the CUDA error message and exit