From c0c5424ccd8a8d772792185d1e43579518dfa363 Mon Sep 17 00:00:00 2001
From: Sarod Yatawatta
Date: Wed, 7 Feb 2018 10:36:47 +0100
Subject: [PATCH] update

---
 src/lib/Radio/Radio.h          |    2 +-
 src/lib/Radio/predict_model.cu | 1095 +++++++++++++++++++++-----------
 2 files changed, 737 insertions(+), 360 deletions(-)

diff --git a/src/lib/Radio/Radio.h b/src/lib/Radio/Radio.h
index de0ee0a..4b39d76 100644
--- a/src/lib/Radio/Radio.h
+++ b/src/lib/Radio/Radio.h
@@ -221,7 +221,7 @@ calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double
    if model has n0>20 or so, try increasing this and recompiling
    the default GPU values is ~ 8MB */
 #ifndef GPU_HEAP_SIZE
-#define GPU_HEAP_SIZE 20
+#define GPU_HEAP_SIZE 32
 #endif
 
 extern void
diff --git a/src/lib/Radio/predict_model.cu b/src/lib/Radio/predict_model.cu
index 6678c34..af1f0c4 100644
--- a/src/lib/Radio/predict_model.cu
+++ b/src/lib/Radio/predict_model.cu
@@ -26,6 +26,25 @@
 /* enable this for checking for kernel failure */
 //#define CUDA_DBG
 
+/* matrix multiplications */
+/* C=A*B */
+__device__ void
+amb(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
+ c[0]=cuCadd(cuCmul(a[0],b[0]),cuCmul(a[1],b[2]));
+ c[1]=cuCadd(cuCmul(a[0],b[1]),cuCmul(a[1],b[3]));
+ c[2]=cuCadd(cuCmul(a[2],b[0]),cuCmul(a[3],b[2]));
+ c[3]=cuCadd(cuCmul(a[2],b[1]),cuCmul(a[3],b[3]));
+}
+/* C=A*B^H */
+__device__ void
+ambt(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) {
+ c[0]=cuCadd(cuCmul(a[0],cuConj(b[0])),cuCmul(a[1],cuConj(b[1])));
+ c[1]=cuCadd(cuCmul(a[0],cuConj(b[2])),cuCmul(a[1],cuConj(b[3])));
+ c[2]=cuCadd(cuCmul(a[2],cuConj(b[0])),cuCmul(a[3],cuConj(b[1])));
+ c[3]=cuCadd(cuCmul(a[2],cuConj(b[2])),cuCmul(a[3],cuConj(b[3])));
+}
+
+
 __device__ void
 radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float thetaGMST, float *az, float *el) {
   float thetaLST=thetaGMST+longitude*180.0f/M_PI;
@@ -51,134 +70,49 @@ radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float th
   *az=azd;
 }
 
-/* slave kernel to calculate phase of manifold vector for given station */
-/* x,y,z: Nx1 arrays of element coords */
-/* sum: scalar to store result */
-/* NOTE: only 1 block should be used here */
-__global__ void
-kernel_array_beam_slave_sin(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
-  unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
-  extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */
-  if (n<N) {
-   tmpsum[n]=sinf(r1*x[n]+r2*y[n]+r3*z[n]);
-  }
-  __syncthreads();
-
-  int nTotalThreads = blockDim_2;
-  while(nTotalThreads > 1) {
-   int halfPoint = (nTotalThreads >> 1); // divide by two
-   if (n < halfPoint) {
-    int thread2 = n + halfPoint;
-    if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
-     tmpsum[n] = tmpsum[n]+tmpsum[thread2];
-    }
-   }
-   __syncthreads();
-   nTotalThreads = halfPoint; // Reducing the binary tree size by two
-  }
-
-  /* now thread 0 will add up results */
-  if (threadIdx.x==0) {
-   *sum=tmpsum[0];
-  }
-}
-
-__global__ void
-kernel_array_beam_slave_cos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
-  unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
-  extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */
-  if (n<N) {
-   tmpsum[n]=cosf(r1*x[n]+r2*y[n]+r3*z[n]);
-  }
-  __syncthreads();
-
-  int nTotalThreads = blockDim_2;
-  while(nTotalThreads > 1) {
-   int halfPoint = (nTotalThreads >> 1); // divide by two
-   if (n < halfPoint) {
-    int thread2 = n + halfPoint;
-    if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
-     tmpsum[n] = tmpsum[n]+tmpsum[thread2];
-    }
-   }
-   __syncthreads();
-   nTotalThreads = halfPoint; // Reducing the binary tree size by two
-  }
-
-  /* now thread 0 will add up results */
-  if (threadIdx.x==0) {
-   *sum=tmpsum[0];
-  }
-}
-
-/* sum: 2x1 array */
-__global__ void
-kernel_array_beam_slave_sincos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
-  unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
-  extern __shared__ float tmpsum[]; /* assumed to be size 2*Nx1 */
-  if (n<N) {
-   sincosf(r1*x[n]+r2*y[n]+r3*z[n],&tmpsum[2*n],&tmpsum[2*n+1]);
-  }
-  __syncthreads();
-
-  int nTotalThreads = blockDim_2;
-  while(nTotalThreads > 1) {
-   int halfPoint = (nTotalThreads >> 1); // divide by two
-   if (n < halfPoint) {
-    int thread2 = n + halfPoint;
-    if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
-     tmpsum[2*n] = tmpsum[2*n]+tmpsum[2*thread2];
-     tmpsum[2*n+1] = tmpsum[2*n+1]+tmpsum[2*thread2+1];
-    }
-   }
-   __syncthreads();
-   nTotalThreads = halfPoint; // Reducing the binary tree size by two
-  }
-
-  /* now thread 0 will add up results */
-  if (threadIdx.x==0) {
-   sum[0]=tmpsum[0];
-   sum[1]=tmpsum[1];
-  }
-}
-
-
-__device__ int
-NearestPowerOf2 (int n){
-  if (!n) return n; //(0 == 2^0)
-
-  int x = 1;
-  while(x < n) {
-   x <<= 1;
-  }
-  return x;
-}
-
-
+/* use compiler directives to use/not use shared memory */
+/* ARRAY_MAX_ELEM : define this in header file beforehand, if using shared memory */
 /* master kernel to calculate beam */
-/* tarr: size NTKFx2 buffer to store sin() cos() sums */
 __global__ void
-kernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude, float *latitude,
- double *time_utc, int *Nelem, float **xx, float **yy, float **zz, float *ra, float *dec, float ph_ra0, float ph_dec0, float ph_freq0, float *beam, float *tarr) {
+kernel_array_beam(int N, int T, int K, int F,
+ const double *__restrict__ freqs, const float *__restrict__ longitude, const float *__restrict__ latitude,
+ const double *__restrict__ time_utc, const int *__restrict__ Nelem,
+ const float * const *__restrict__ xx, const float * const *__restrict__ yy, const float * const *__restrict__ zz,
+ const float *__restrict__ ra, const float *__restrict__ dec,
+ float ph_ra0, float ph_dec0, float ph_freq0, float *beam) {
 
-  /* global thread index */
-  unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
-  /* allowed max threads */
-  int Ntotal=N*T*K*F;
-  if (n>>(Nelems,r1,r2,r3,xx[istat],yy[istat],zz[istat],&tarr[2*n],NearestPowerOf2(Nelems));
-   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
-   error = cudaGetLastError();
-   if(error != cudaSuccess) {
-    // print the CUDA error message and exit
-    printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
-   }
 #endif
-   float ssum=__ldg(&tarr[2*n]);
-   float csum=__ldg(&tarr[2*n+1]);
-   float Nnor=1.0f/(float)Nelems;
   ssum*=Nnor;
   csum*=Nnor;
   /* store output (amplitude of beam)*/
+  int boffset=itm*N*K*F+isrc*N*F+ifrq*N+istat;
   beam[boffset]=sqrtf(ssum*ssum+csum*csum);
   //printf("thread %d stat %d src %d freq %d time %d : %lf longitude=%lf latitude=%lf time=%lf freq=%lf elem=%d ra=%lf dec=%lf beam=%lf\n",n,istat,isrc,ifrq,itm,time_utc[itm],longitude[istat],latitude[istat],time_utc[itm],freqs[ifrq],Nelem[istat],ra[isrc],dec[isrc],beam[boffset]);
  }
@@ -247,8 +175,8 @@ kernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude, fl
 }
 
 /***************************************************************************/
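For reference, the removed slave kernels above all use the same shared-memory reduction idiom: pad the thread count up to the next power of two (NearestPowerOf2) and halve a summation tree until one partial sum remains. Below is a minimal standalone sketch of that idiom, not part of this patch; the names reduce_sum_sketch and next_pow2_sketch and the launch configuration are illustrative only.

#include <cuda_runtime.h>

/* next power of two, as in NearestPowerOf2() above */
__device__ int next_pow2_sketch(int n) {
  if (!n) return n;
  int x=1;
  while (x<n) { x<<=1; }
  return x;
}

/* sum N floats with one block; mirrors the kernel_array_beam_slave_* reduction.
   Assumed launch: reduce_sum_sketch<<<1,N,N*sizeof(float)>>>(N,d_x,d_sum) */
__global__ void reduce_sum_sketch(int N, const float *x, float *sum) {
  extern __shared__ float tmpsum[];
  unsigned int n=threadIdx.x;
  if (n<N) { tmpsum[n]=x[n]; }
  __syncthreads();
  int nTotalThreads=next_pow2_sketch(N); /* pad to a power of two */
  while (nTotalThreads>1) {
    int halfPoint=(nTotalThreads>>1);
    if (n<halfPoint) {
      int thread2=n+halfPoint;
      if (thread2<blockDim.x) { /* skip the fictitious threads N..pow2-1 */
        tmpsum[n]=tmpsum[n]+tmpsum[thread2];
      }
    }
    __syncthreads();
    nTotalThreads=halfPoint;
  }
  if (threadIdx.x==0) { *sum=tmpsum[0]; }
}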
-__device__ cuFloatComplex
-gaussian_contrib(int *dd, float u, float v, float w) {
+__device__ cuDoubleComplex
+gaussian_contrib__(int *dd, float u, float v, float w) {
  exinfo_gaussian *dp=(exinfo_gaussian*)dd;
  float up,vp,a,b,ut,vt,cosph,sinph;
 
@@ -268,13 +196,13 @@ gaussian_contrib(int *dd, float u, float v, float w) {
  ut=a*(cosph*up-sinph*vp);
  vt=b*(sinph*up+cosph*vp);
 
- return make_cuFloatComplex(0.5f*M_PI*expf(-(ut*ut+vt*vt)),0.0f);
+ return make_cuDoubleComplex((double)(0.5f*M_PI*expf(-(ut*ut+vt*vt))),0.0);
 }
 
-__device__ cuFloatComplex
-ring_contrib(int *dd, float u, float v, float w) {
+__device__ cuDoubleComplex
+ring_contrib__(int *dd, float u, float v, float w) {
  exinfo_ring *dp=(exinfo_ring*)dd;
  float up,vp,a,b;
 
@@ -285,11 +213,11 @@ ring_contrib(int *dd, float u, float v, float w) {
  a=dp->eX; /* diameter */
  b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI;
 
- return make_cuFloatComplex(j0f(b),0.0f);
+ return make_cuDoubleComplex((double)j0f(b),0.0);
 }
 
-__device__ cuFloatComplex
-disk_contrib(int *dd, float u, float v, float w) {
+__device__ cuDoubleComplex
+disk_contrib__(int *dd, float u, float v, float w) {
  exinfo_disk *dp=(exinfo_disk*)dd;
  float up,vp,a,b;
 
@@ -300,7 +228,7 @@ disk_contrib(int *dd, float u, float v, float w) {
  a=dp->eX; /* diameter */
  b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI;
 
- return make_cuFloatComplex(j1f(b),0.0f);
+ return make_cuDoubleComplex((double)j1f(b),0.0);
 }
 
 
@@ -324,7 +252,7 @@ H_e(float x, int n) {
 }
 
 __device__ void
-calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av, int *cplx) {
+calculate_uv_mode_vectors_scalar00(float u, float v, float beta, int n0, float *Av, int *cplx) {
 
  int xci,zci,Ntot;
 
@@ -393,8 +321,56 @@ calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av
  free(shpvl);
 }
 
-__device__ cuFloatComplex
-shapelet_contrib(int *dd, float u, float v, float w) {
+
+__device__ void
+calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av, int *cplx, float *fact, float *shpvl) {
+
+ int xci;
+
+ int n1,n2,start;
+ int signval;
+
+
+ /* start filling in the array from the positive values */
+ float xvalu=u*beta;
+ float expvalu=__expf(-0.5f*xvalu*xvalu);
+ float xvalv=v*beta;
+ float expvalv=__expf(-0.5f*xvalv*xvalv);
+ for (xci=0; xcieP,&sinph,&cosph);
  ut=a*(cosph*up-sinph*vp);
  vt=b*(sinph*up+cosph*vp);
+ /* if u,v is way off the scale (beta) of shapelet modes, the result is almost always zero,
+    so check this here and return 0, otherwise spurious nans may result */
+ if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))<dp->beta) {
+  return make_cuDoubleComplex(0.0,0.0);
+ }
  /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v) so negate the u grid */
  Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float));
  cplx=(int*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(int));
- calculate_uv_mode_vectors_scalar(-ut, vt, dp->beta, dp->n0, Av, cplx);
+ float *fact=0;
+ float *shpvl=0;
+ /* set up factorial array */
+ fact=(float *)malloc((size_t)(dp->n0)*sizeof(float));
+ /* setup array to store calculated shapelet value */
+ /* need max storage 2 x n0 */
+ shpvl=(float*)malloc((size_t)(2*dp->n0)*sizeof(float));
+
+ if (!fact || !Av || !cplx || !shpvl) {
+  printf("Error: Device memory allocation failure!! increase heap size.\n");
+ }
+ fact[0]=1.0f;
+ for (ci=1; ci<dp->n0; ci++) {
+  fact[ci]=((float)ci+1.0f)*fact[ci-1];
+ }
+
+
+ calculate_uv_mode_vectors_scalar(-ut, vt, dp->beta, dp->n0, Av, cplx, fact, shpvl);
+
+ free(fact);
+ free(shpvl);
+
 realsum=imagsum=0.0f;
 M=(dp->n0)*(dp->n0);
 for (ci=0; ci1) {
-   sI0f=__ldg(&sI0[k]);
-   spec_idxf=__ldg(&spec_idx[k]);
-   spec_idx1f=__ldg(&spec_idx1[k]);
-   spec_idx2f=__ldg(&spec_idx2[k]);
-   myf0=__ldg(&f0[k]);
-  }
-  unsigned char stypeT=__ldg(&stype[k]);
-  for(int cf=0; cf0.0f) {
-    If=__expf(__logf(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
-   } else if (sI0f<0.0f) {
-    If=-__expf(__logf(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
-   } else {
-    If=0.0f;
-   }
+__device__ void
+compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F,
+double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
+ double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) {
+ /* F>1 is assumed output: 8x1 array */
+ double sinph,cosph;
+ sincos(phterm0*myfreq,&sinph,&cosph);
+ cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
+ double If,Qf,Uf,Vf;
+ If=Qf=Uf=Vf=0.0;
+ /* evaluate spectra */
+ double fratio=log(myfreq/myf0);
+ double fratio1=fratio*fratio;
+ double fratio2=fratio1*fratio;
+ double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2;
+ /* catch -ve flux */
+ if (sI0f>0.0) {
+  If=exp(log(sI0f)+cm);
+ } else if (sI0f<0.0) {
+  If=-exp(log(-sI0f)+cm);
+ }
+ if (sQ0f>0.0) {
+  Qf=exp(log(sQ0f)+cm);
+ } else if (sQ0f<0.0) {
+  Qf=-exp(log(-sQ0f)+cm);
+ }
+ if (sU0f>0.0) {
+  Uf=exp(log(sU0f)+cm);
+ } else if (sU0f<0.0) {
+  Uf=-exp(log(-sU0f)+cm);
+ }
+ if (sV0f>0.0) {
+  Vf=exp(log(sV0f)+cm);
+ } else if (sV0f<0.0) {
+  Vf=-exp(log(-sV0f)+cm);
 }
 
-  /* smearing */
-  float phterm =phterm0*0.5f*deltaf;
-  if (phterm!=0.0f) {
-   sinph=__sinf(phterm)/phterm;
-   If *=fabsf(sinph); /* catch -ve values due to rounding off */
+
+ /* smearing, beam */
+ double scalef=1.0;
+ double phterm =(phterm0*0.5*deltaf);
+ if (phterm!=0.0) {
+  sinph=(sin(phterm)/phterm);
+  scalef *=fabs(sinph); /* catch -ve values due to rounding off */
  }
 
 if (dobeam) {
  /* get beam info */
  //int boffset1=sta1*K*T*F + k1*T*F + cf*T + itm;
- int boffset1=itm*N*K*F+k1*N*F+cf*N+sta1;
+
+ int boffset1=itm*N*K*F+k*N*F+cf*N+sta1;
+ // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2);
  float beam1=__ldg(&beam[boffset1]);
  //int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm;
- int boffset2=itm*N*K*F+k1*N*F+cf*N+sta2;
+ int boffset2=itm*N*K*F+k*N*F+cf*N+sta2;
  float beam2=__ldg(&beam[boffset2]);
- If *=beam1*beam2;
+ scalef *=(double)(beam1*beam2);
 }
+
 /* form complex value */
- prodterm.x *=If;
- prodterm.y *=If;
+ prodterm.x *=scalef;
+ prodterm.y *=scalef;
 
 /* check for type of source */
- if (stypeT!=STYPE_POINT) {
-  float uscaled=u*myfreq;
-  float vscaled=v*myfreq;
-  float wscaled=w*myfreq;
+ if (stypeT!=STYPE_POINT && !(u==0.0 && v==0.0)) {
+  float uscaled=(float)(u*myfreq);
+  float vscaled=(float)(v*myfreq);
+  float wscaled=(float)(w*myfreq);
   if (stypeT==STYPE_SHAPELET) {
-   prodterm=cuCmulf(shapelet_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
+   prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_GAUSSIAN) {
-   prodterm=cuCmulf(gaussian_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
+   prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_DISK) {
-   prodterm=cuCmulf(disk_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
+   prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  } else if (stypeT==STYPE_RING) {
-   prodterm=cuCmulf(ring_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
+   prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
  }
 }
-
-//printf("k=%d cf=%d freq=%f uvw %f,%f,%f lmn %f,%f,%f phterm %f If %f\n",k,cf,freqs[cf],u,v,w,ll[k],mm[k],nn[k],phterm,If);
-   /* write output to shared array */
-   tmpcoh[k*8*F+8*cf]=prodterm.x;
-   tmpcoh[k*8*F+8*cf+1]=prodterm.y;
-   tmpcoh[k*8*F+8*cf+2]=0.0f;
-   tmpcoh[k*8*F+8*cf+3]=0.0f;
-   tmpcoh[k*8*F+8*cf+4]=0.0f;
-   tmpcoh[k*8*F+8*cf+5]=0.0f;
-   tmpcoh[k*8*F+8*cf+6]=prodterm.x;
-   tmpcoh[k*8*F+8*cf+7]=prodterm.y;
-  }
-  }
-  __syncthreads();
+ double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy;
+ Ix=If*prodterm.x;
+ Iy=If*prodterm.y;
+ Qx=Qf*prodterm.x;
+ Qy=Qf*prodterm.y;
+ Ux=Uf*prodterm.x;
+ Uy=Uf*prodterm.y;
+ Vx=Vf*prodterm.x;
+ Vy=Vf*prodterm.y;
 
-  // Build summation tree over elements, handling case where total threads is not a power of two.
-  int nTotalThreads = blockDim_2; // Total number of threads (==Kused), rounded up to the next power of two
-  while(nTotalThreads > 1) {
-   int halfPoint = (nTotalThreads >> 1); // divide by two
-   if (k < halfPoint) {
-    int thread2 = k + halfPoint;
-    if (thread2 < blockDim.x) { // Skipping the fictitious threads >Kused ( blockDim.x ... blockDim_2-1 )
-     for(int cf=0; cf>>(sta1,sta2,tslot,B,N,T,K,K,0,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,ll,mm,nn,sI,stype,sI0,f0,spec_idx,spec_idx1,spec_idx2,exs,deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(K));
-   cudaDeviceSynchronize();
 #ifdef CUDA_DBG
-   error = cudaGetLastError();
-   if(error != cudaSuccess) {
-    // print the CUDA error message and exit
-    printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
-   }
 #endif
-  } else {
-   /* more than 1 kernel */
-   int L=(K+ThreadsPerBlock-1)/ThreadsPerBlock;
-   int ct=0;
-   int myT;
-   for (int ci=0; ci>>(sta1,sta2,tslot,B,N,T,K,myT,ct,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,&ll[ct],&mm[ct],&nn[ct],&sI[ct],&stype[ct],&sI0[ct],&f0[ct],&spec_idx[ct],&spec_idx1[ct],&spec_idx2[ct],&exs[ct],deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(myT));
-    cudaDeviceSynchronize();
 #ifdef CUDA_DBG
-    error = cudaGetLastError();
-    if(error != cudaSuccess) {
-     // print the CUDA error message and exit
-     printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
-    }
 #endif
-    ct=ct+ThreadsPerBlock;
-   }
+   //TODO: figure out if this max_f makes any sense
+   #define MAX_F 20
+   double l_coh[MAX_F][8];
+   for(int cf=0; cf1
+   if (F==1) {
+    //use simply for-loop, if K is very large this may be slow and may need further parallelization
+    for (int k=0; k1
+  if (F==1) {
+   //use simply for-loop, if K is very large this may be slow and may need further parallelization
+   for (int k=0; k>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam,buffer);
+
+  /* 2D grid of threads: x dim->data, y dim-> stations */
+  dim3 grid(1, 1, 1);
+  grid.x = (int)ceilf((K*T*F) / (float)ThreadsPerBlock);
+  grid.y = N;
+
+  kernel_array_beam<<<grid,ThreadsPerBlock>>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam);
  cudaDeviceSynchronize();
- cudaFree(buffer);
 #ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess) {
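The replacement launch geometry above is two-dimensional: grid.x covers the K*T*F (source, time, frequency) work items and grid.y gives one row of blocks per station, and each amplitude is written at boffset = itm*N*K*F + isrc*N*F + ifrq*N + istat, i.e. a [T][K][F][N] layout. A small host-side sketch of that sizing and index arithmetic follows; the helper names are hypothetical and only restate what the kernel and launcher above already do.

#include <cuda_runtime.h>
#include <math.h>

/* grid sizing used by cudakernel_array_beam: x over data items, y over stations */
static dim3 beam_grid_sketch(int N, int T, int K, int F, int ThreadsPerBlock) {
  dim3 grid(1,1,1);
  grid.x=(unsigned int)ceilf((float)(K*T*F)/(float)ThreadsPerBlock);
  grid.y=(unsigned int)N;
  return grid;
}

/* beam[] index for (time itm, source isrc, freq ifrq, station istat),
   matching boffset=itm*N*K*F+isrc*N*F+ifrq*N+istat in kernel_array_beam */
static int beam_offset_sketch(int N, int K, int F, int itm, int isrc, int ifrq, int istat) {
  return itm*N*K*F+isrc*N*F+ifrq*N+istat;
}

/* usage, e.g.:
   dim3 grid=beam_grid_sketch(N,T,K,F,DEFAULT_TH_PER_BK);
   kernel_array_beam<<<grid,DEFAULT_TH_PER_BK>>>(...); */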
@@ -751,7 +1084,7 @@ cudakernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude
 
 /* calculate coherencies:
-  B: total baselines
+  B: total baselines (could be more than one timeslot)
   N: no of stations
   T: no of time slots
   K: no of sources
@@ -774,26 +1107,20 @@ cudakernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude
   dobeam: enable beam if >0
 */
 void
-cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, float *w,baseline_t *barr, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI,
- unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *coh,int dobeam) {
+cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
+ unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
 #ifdef CUDA_DBG
  cudaError_t error;
  error = cudaGetLastError();
- error=cudaMemset(coh,0,sizeof(float)*8*B*F);
- checkCudaError(error,__FILE__,__LINE__);
-#endif
-#ifndef CUDA_DBG
- cudaMemset(coh,0,sizeof(float)*8*B*F);
 #endif
 
  /* spawn threads to handle baselines, these threads will spawn threads for sources */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
- // int ThreadsPerBlock=16;
  /* note: make sure we do not exceed max no of blocks available,
  otherwise (too many baselines, loop over source id) */
- int BlocksPerGrid= 2*(B+ThreadsPerBlock-1)/ThreadsPerBlock;
- kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI,
-  stype, sI0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
+ int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
+ kernel_coherencies<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
+  stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
  cudaDeviceSynchronize();
 #ifdef CUDA_DBG
  error = cudaGetLastError();
@@ -806,6 +1133,56 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, fl
 }
 
 
+/* p : parameters 8Nxnchunk values */
+void
+cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV,
+ unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
+#ifdef CUDA_DBG
+ cudaError_t error;
+ error = cudaGetLastError();
+#endif
+
+ /* spawn threads to handle baselines, these threads will loop over sources */
+ int ThreadsPerBlock=DEFAULT_TH_PER_BK;
+ /* note: make sure we do not exceed max no of blocks available,
+ otherwise (too many baselines, loop over source id) */
+ int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock;
+ kernel_residuals<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV,
+  stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam);
+ cudaDeviceSynchronize();
+#ifdef CUDA_DBG
+ error = cudaGetLastError();
+ if(error != cudaSuccess) {
+  // print the CUDA error message and exit
+  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
+  exit(-1);
+ }
+#endif
+}
+
+
+void
+cudakernel_correct_residuals(int B, int N, int Nb, int boff, int F, int nchunk, double *x, double *p, baseline_t *barr) {
+#ifdef CUDA_DBG
+ cudaError_t error;
+ error = cudaGetLastError();
+#endif
+
+ /* spawn threads to handle baselines */
+ int ThreadsPerBlock=DEFAULT_TH_PER_BK;
+ int BlocksPerGrid=(Nb+ThreadsPerBlock-1)/ThreadsPerBlock;
+ kernel_correct_residuals<<<BlocksPerGrid,ThreadsPerBlock>>>(B, N, Nb, boff, F, nchunk, x, p, barr);
+ cudaDeviceSynchronize();
+#ifdef CUDA_DBG
+ error = cudaGetLastError();
+ if(error != cudaSuccess) {
+  // print the CUDA error message and exit
+  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
+  exit(-1);
+ }
+#endif
+}
+
+
 /* convert time JD to GMST angle
  store result at the same location */
 void
@@ -818,10 +1195,10 @@ cudakernel_convert_time(int T, double *time_utc) {
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  /* note: make sure we do not exceed max no of blocks available,
  otherwise (too many baselines, loop over source id) */
- int BlocksPerGrid= 2*(T+ThreadsPerBlock-1)/ThreadsPerBlock;
+ int BlocksPerGrid=(T+ThreadsPerBlock-1)/ThreadsPerBlock;
  kernel_convert_time<<<BlocksPerGrid,ThreadsPerBlock>>>(T,time_utc);
  cudaDeviceSynchronize();
- #ifdef CUDA_DBG
+#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess) {
   // print the CUDA error message and exit
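Every launcher in this file follows the same pattern: launch the kernel, call cudaDeviceSynchronize(), and under CUDA_DBG call cudaGetLastError() and print and exit on failure. A minimal sketch of an equivalent helper is given below; the name check_last_cuda_error_sketch is illustrative only and is not the checkCudaError() referenced elsewhere in the library.

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* illustrative only: mirrors the repeated CUDA_DBG blocks above */
static void check_last_cuda_error_sketch(const char *file, int line) {
#ifdef CUDA_DBG
  cudaError_t error=cudaGetLastError();
  if (error!=cudaSuccess) {
    fprintf(stderr,"CUDA error: %s :%s: %d\n",cudaGetErrorString(error),file,line);
    exit(-1);
  }
#else
  (void)file; (void)line;
#endif
}

/* usage after a launch, e.g.
   kernel_convert_time<<<BlocksPerGrid,ThreadsPerBlock>>>(T,time_utc);
   cudaDeviceSynchronize();
   check_last_cuda_error_sketch(__FILE__,__LINE__);
*/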