Improved kernel (coherencies) from Hanno Spreeuw and Ben van Werkhoven as part of NLeSC DIRAC project, removed dynamic parallelism, added double precision

This commit is contained in:
Sarod Yatawatta 2017-11-13 10:39:40 +01:00
parent 4fa8cffcdd
commit 2150c4d12b
1 changed files with 105 additions and 282 deletions

View File

@ -51,121 +51,12 @@ radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float th
*az=azd;
}
/* slave kernel to calculate phase of manifold vector for given station */
/* x,y,z: Nx1 arrays of element coords */
/* sum: scalar to store result */
/* NOTE: only 1 block should be used here */
__global__ void
kernel_array_beam_slave_sin(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */
if (n<N) {
tmpsum[n]=sinf((r1*__ldg(&x[n])+r2*__ldg(&y[n])+r3*__ldg(&z[n])));
}
__syncthreads();
// Build summation tree over elements, handling case where total threads is not a power of two.
int nTotalThreads = blockDim_2; // Total number of threads (==N), rounded up to the next power of two
while(nTotalThreads > 1) {
int halfPoint = (nTotalThreads >> 1); // divide by two
if (n < halfPoint) {
int thread2 = n + halfPoint;
if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
tmpsum[n] = tmpsum[n]+tmpsum[thread2];
}
}
__syncthreads();
nTotalThreads = halfPoint; // Reducing the binary tree size by two
}
/* now thread 0 will add up results */
if (threadIdx.x==0) {
*sum=tmpsum[0];
}
}
__global__ void
kernel_array_beam_slave_cos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */
if (n<N) {
tmpsum[n]=cosf((r1*__ldg(&x[n])+r2*__ldg(&y[n])+r3*__ldg(&z[n])));
}
__syncthreads();
// Build summation tree over elements, handling case where total threads is not a power of two.
int nTotalThreads = blockDim_2; // Total number of threads (==N), rounded up to the next power of two
while(nTotalThreads > 1) {
int halfPoint = (nTotalThreads >> 1); // divide by two
if (n < halfPoint) {
int thread2 = n + halfPoint;
if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
tmpsum[n] = tmpsum[n]+tmpsum[thread2];
}
}
__syncthreads();
nTotalThreads = halfPoint; // Reducing the binary tree size by two
}
/* now thread 0 will add up results */
if (threadIdx.x==0) {
*sum=tmpsum[0];
}
}
/* sum: 2x1 array */
__global__ void
kernel_array_beam_slave_sincos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
extern __shared__ float tmpsum[]; /* assumed to be size 2*Nx1 */
if (n<N) {
float ss,cc;
sincosf((r1*__ldg(&x[n])+r2*__ldg(&y[n])+r3*__ldg(&z[n])),&ss,&cc);
tmpsum[2*n]=ss;
tmpsum[2*n+1]=cc;
}
__syncthreads();
// Build summation tree over elements, handling case where total threads is not a power of two.
int nTotalThreads = blockDim_2; // Total number of threads (==N), rounded up to the next power of two
while(nTotalThreads > 1) {
int halfPoint = (nTotalThreads >> 1); // divide by two
if (n < halfPoint) {
int thread2 = n + halfPoint;
if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
tmpsum[2*n] = tmpsum[2*n]+tmpsum[2*thread2];
tmpsum[2*n+1] = tmpsum[2*n+1]+tmpsum[2*thread2+1];
}
}
__syncthreads();
nTotalThreads = halfPoint; // Reducing the binary tree size by two
}
/* now thread 0 will add up results */
if (threadIdx.x==0) {
sum[0]=tmpsum[0];
sum[1]=tmpsum[1];
}
}
__device__ int
NearestPowerOf2 (int n){
if (!n) return n; //(0 == 2^0)
int x = 1;
while(x < n) {
x <<= 1;
}
return x;
}
/* use compiler directives to use/not use shared memory */
/* ARRAY_MAX_ELEM : define this in header file beforehand, if using shared memory */
/* master kernel to calculate beam */
__global__ void
kernel_array_beam(int N, int T, int K, int F,
const float *__restrict__ freqs, const float *__restrict__ longitude, const float *__restrict__ latitude,
const double *__restrict__ freqs, const float *__restrict__ longitude, const float *__restrict__ latitude,
const double *__restrict__ time_utc, const int *__restrict__ Nelem,
const float * const *__restrict__ xx, const float * const *__restrict__ yy, const float * const *__restrict__ zz,
const float *__restrict__ ra, const float *__restrict__ dec,
@ -230,7 +121,7 @@ kernel_array_beam(int N, int T, int K, int F,
r2=(float)-tpc*(ph_freq0*sint0*sinph0-freqs[ifrq]*sint*sinph);
r3=(float)-tpc*(ph_freq0*cost0-freqs[ifrq]*cost);
*/
float f=__ldg(&freqs[ifrq]);
float f=(float)__ldg(&freqs[ifrq]);
float rat1=ph_freq0*sint0;
float rat2=f*sint;
r1=-tpc*(rat1*cosph0-rat2*cosph);
@ -265,8 +156,8 @@ kernel_array_beam(int N, int T, int K, int F,
}
/***************************************************************************/
__device__ cuFloatComplex
gaussian_contrib(int *dd, float u, float v, float w) {
__device__ cuDoubleComplex
gaussian_contrib__(int *dd, float u, float v, float w) {
exinfo_gaussian *dp=(exinfo_gaussian*)dd;
float up,vp,a,b,ut,vt,cosph,sinph;
@ -286,13 +177,13 @@ gaussian_contrib(int *dd, float u, float v, float w) {
ut=a*(cosph*up-sinph*vp);
vt=b*(sinph*up+cosph*vp);
return make_cuFloatComplex(0.5f*M_PI*expf(-(ut*ut+vt*vt)),0.0f);
return make_cuDoubleComplex((double)(0.5f*M_PI*expf(-(ut*ut+vt*vt))),0.0);
}
__device__ cuFloatComplex
ring_contrib(int *dd, float u, float v, float w) {
__device__ cuDoubleComplex
ring_contrib__(int *dd, float u, float v, float w) {
exinfo_ring *dp=(exinfo_ring*)dd;
float up,vp,a,b;
@ -303,11 +194,11 @@ ring_contrib(int *dd, float u, float v, float w) {
a=dp->eX; /* diameter */
b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI;
return make_cuFloatComplex(j0f(b),0.0f);
return make_cuDoubleComplex((double)j0f(b),0.0);
}
__device__ cuFloatComplex
disk_contrib(int *dd, float u, float v, float w) {
__device__ cuDoubleComplex
disk_contrib__(int *dd, float u, float v, float w) {
exinfo_disk *dp=(exinfo_disk*)dd;
float up,vp,a,b;
@ -318,7 +209,7 @@ disk_contrib(int *dd, float u, float v, float w) {
a=dp->eX; /* diameter */
b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI;
return make_cuFloatComplex(j1f(b),0.0f);
return make_cuDoubleComplex((double)j1f(b),0.0);
}
@ -411,8 +302,8 @@ calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av
free(shpvl);
}
__device__ cuFloatComplex
shapelet_contrib(int *dd, float u, float v, float w) {
__device__ cuDoubleComplex
shapelet_contrib__(int *dd, float u, float v, float w) {
exinfo_shapelet *dp=(exinfo_shapelet*)dd;
int *cplx;
float *Av;
@ -457,160 +348,91 @@ shapelet_contrib(int *dd, float u, float v, float w) {
//return 2.0*M_PI*(realsum+_Complex_I*imagsum);
realsum*=2.0f*M_PI*a*b;
imagsum*=2.0f*M_PI*a*b;
return make_cuFloatComplex(realsum,imagsum);
return make_cuDoubleComplex((double)realsum,(double)imagsum);
}
__device__ cuDoubleComplex
compute_prodterm(int sta1, int sta2, int N, int K, int T, int F,
double phterm0, double sIf, double sI0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0,
const double *__restrict__ freqs, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w) {
/* slave thread to calculate coherencies, for 1 source */
/* baseline (sta1,sta2) at time itm */
/* K: total sources, uset to find right offset
Kused: actual sources calculated in this thread block
Koff: offset in source array to start calculation
NOTE: only 1 block is used
*/
__global__ void
kernel_coherencies_slave(int sta1, int sta2, int itm, int B, int N, int T, int K, int Kused, int Koff, int F, float u, float v, float w, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI,
unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *__restrict__ coh,int dobeam,int blockDim_2) {
/* which source we work on */
unsigned int k=threadIdx.x+blockDim.x*blockIdx.x;
extern __shared__ float tmpcoh[]; /* assumed to be size 8*F*Kusedx1 */
if (k<Kused) {
int k1=k+Koff; /* actual source id */
/* preload all freq independent variables */
/* Fourier phase */
float phterm0=2.0f*M_PI*(u*__ldg(&ll[k])+v*__ldg(&mm[k])+w*__ldg(&nn[k]));
float sIf,sI0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
sIf=__ldg(&sI[k]);
if (F>1) {
sI0f=__ldg(&sI0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
}
unsigned char stypeT=__ldg(&stype[k]);
for(int cf=0; cf<F; cf++) {
float sinph,cosph;
float myfreq=__ldg(&freqs[cf]);
sincosf(phterm0*myfreq,&sinph,&cosph);
cuFloatComplex prodterm=make_cuFloatComplex(cosph,sinph);
float If;
double sinph,cosph;
double myfreq=__ldg(&freqs[cf]);
sincos(phterm0*myfreq,&sinph,&cosph);
cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph);
double If;
if (F==1) {
/* flux: do not use spectra here, because F=1*/
If=sIf;
} else {
/* evaluate spectra */
float fratio=__logf(myfreq/myf0);
float fratio1=fratio*fratio;
float fratio2=fratio1*fratio;
double fratio=log(myfreq/myf0);
double fratio1=fratio*fratio;
double fratio2=fratio1*fratio;
/* catch -ve flux */
if (sI0f>0.0f) {
If=__expf(__logf(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
} else if (sI0f<0.0f) {
If=-__expf(__logf(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
if (sI0f>0.0) {
If=exp(log(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
} else if (sI0f<0.0) {
If=-exp(log(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2);
} else {
If=0.0f;
If=0.0;
}
}
/* smearing */
float phterm =phterm0*0.5f*deltaf;
if (phterm!=0.0f) {
sinph=__sinf(phterm)/phterm;
double phterm =(phterm0*0.5*deltaf);
if (phterm!=0.0) {
sinph=(sin(phterm)/phterm);
If *=fabsf(sinph); /* catch -ve values due to rounding off */
}
if (dobeam) {
/* get beam info */
//int boffset1=sta1*K*T*F + k1*T*F + cf*T + itm;
int boffset1=itm*N*K*F+k1*N*F+cf*N+sta1;
int boffset1=itm*N*K*F+k*N*F+cf*N+sta1;
// printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2);
float beam1=__ldg(&beam[boffset1]);
//int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm;
int boffset2=itm*N*K*F+k1*N*F+cf*N+sta2;
int boffset2=itm*N*K*F+k*N*F+cf*N+sta2;
float beam2=__ldg(&beam[boffset2]);
If *=beam1*beam2;
If *=(double)(beam1*beam2);
}
/* form complex value */
prodterm.x *=If;
prodterm.y *=If;
/* check for type of source */
if (stypeT!=STYPE_POINT) {
float uscaled=u*myfreq;
float vscaled=v*myfreq;
float wscaled=w*myfreq;
double uscaled=u*myfreq;
double vscaled=v*myfreq;
double wscaled=w*myfreq;
if (stypeT==STYPE_SHAPELET) {
prodterm=cuCmulf(shapelet_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_GAUSSIAN) {
prodterm=cuCmulf(gaussian_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_DISK) {
prodterm=cuCmulf(disk_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
} else if (stypeT==STYPE_RING) {
prodterm=cuCmulf(ring_contrib(exs[k],uscaled,vscaled,wscaled),prodterm);
prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm);
}
}
//printf("k=%d cf=%d freq=%f uvw %f,%f,%f lmn %f,%f,%f phterm %f If %f\n",k,cf,freqs[cf],u,v,w,ll[k],mm[k],nn[k],phterm,If);
/* write output to shared array */
tmpcoh[k*8*F+8*cf]=prodterm.x;
tmpcoh[k*8*F+8*cf+1]=prodterm.y;
tmpcoh[k*8*F+8*cf+2]=0.0f;
tmpcoh[k*8*F+8*cf+3]=0.0f;
tmpcoh[k*8*F+8*cf+4]=0.0f;
tmpcoh[k*8*F+8*cf+5]=0.0f;
tmpcoh[k*8*F+8*cf+6]=prodterm.x;
tmpcoh[k*8*F+8*cf+7]=prodterm.y;
}
}
__syncthreads();
//printf("Input k=%d uvw %lE,%lE,%lE sI=%f phterm0=%lf\n",k,u,v,w,sIf,phterm0);
//printf("phterm %lf smearing %f\n",phterm,sinph);
//printf("k=%d uvw %lE,%lE,%lE If=%f phterm %lf out=%f,%f\n",k,u,v,w,If,phterm,prodterm.x,prodterm.y);
return prodterm;
// Build summation tree over elements, handling case where total threads is not a power of two.
int nTotalThreads = blockDim_2; // Total number of threads (==Kused), rounded up to the next power of two
while(nTotalThreads > 1) {
int halfPoint = (nTotalThreads >> 1); // divide by two
if (k < halfPoint) {
int thread2 = k + halfPoint;
if (thread2 < blockDim.x) { // Skipping the fictitious threads >Kused ( blockDim.x ... blockDim_2-1 )
for(int cf=0; cf<F; cf++) {
tmpcoh[k*8*F+8*cf]=tmpcoh[k*8*F+8*cf]+tmpcoh[thread2*8*F+8*cf];
tmpcoh[k*8*F+8*cf+1]=tmpcoh[k*8*F+8*cf+1]+tmpcoh[thread2*8*F+8*cf+1];
tmpcoh[k*8*F+8*cf+2]=tmpcoh[k*8*F+8*cf+2]+tmpcoh[thread2*8*F+8*cf+2];
tmpcoh[k*8*F+8*cf+3]=tmpcoh[k*8*F+8*cf+3]+tmpcoh[thread2*8*F+8*cf+3];
tmpcoh[k*8*F+8*cf+4]=tmpcoh[k*8*F+8*cf+4]+tmpcoh[thread2*8*F+8*cf+4];
tmpcoh[k*8*F+8*cf+5]=tmpcoh[k*8*F+8*cf+5]+tmpcoh[thread2*8*F+8*cf+5];
tmpcoh[k*8*F+8*cf+6]=tmpcoh[k*8*F+8*cf+6]+tmpcoh[thread2*8*F+8*cf+6];
tmpcoh[k*8*F+8*cf+7]=tmpcoh[k*8*F+8*cf+7]+tmpcoh[thread2*8*F+8*cf+7];
}
}
}
__syncthreads();
nTotalThreads = halfPoint; // Reducing the binary tree size by two
}
/* add up to form final result (not replace!) */
if (threadIdx.x==0) {
for(int cf=0; cf<F; cf++) {
coh[cf*8*B]+=tmpcoh[8*cf];
coh[cf*8*B+1]+=tmpcoh[8*cf+1];
coh[cf*8*B+2]+=tmpcoh[8*cf+2];
coh[cf*8*B+3]+=tmpcoh[8*cf+3];
coh[cf*8*B+4]+=tmpcoh[8*cf+4];
coh[cf*8*B+5]+=tmpcoh[8*cf+5];
coh[cf*8*B+6]+=tmpcoh[8*cf+6];
coh[cf*8*B+7]+=tmpcoh[8*cf+7];
}
}
}
/* master kernel to calculate coherencies */
__global__ void
kernel_coherencies(int B, int N, int T, int K, int F,float *u, float *v, float *w,baseline_t *barr, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI,
unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *coh, int dobeam) {
kernel_coherencies(int B, int N, int T, int K, int F,
const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w,
baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn, const double *__restrict__ sI,
const unsigned char *__restrict__ stype, const double *__restrict__ sI0, const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) {
/* global thread index */
unsigned int n=threadIdx.x+blockDim.x*blockIdx.x;
@ -623,49 +445,55 @@ kernel_coherencies(int B, int N, int T, int K, int F,float *u, float *v, float *
int tslot=n/((N*(N-1)/2));
#ifdef CUDA_DBG
cudaError_t error;
#endif
double u_n=(u[n]);
double v_n=(v[n]);
double w_n=(w[n]);
int ThreadsPerBlock=DEFAULT_TH_PER_BK;
/* each slave thread will calculate one source, 8xF values for all freq */
/* also give right offset for coherencies */
if (K<ThreadsPerBlock) {
/* one kernel is enough, offset is 0 */
kernel_coherencies_slave<<<1,K,sizeof(float)*(8*F*K)>>>(sta1,sta2,tslot,B,N,T,K,K,0,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,ll,mm,nn,sI,stype,sI0,f0,spec_idx,spec_idx1,spec_idx2,exs,deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(K));
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
if(error != cudaSuccess) {
// print the CUDA error message and exit
printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
}
#endif
} else {
/* more than 1 kernel */
int L=(K+ThreadsPerBlock-1)/ThreadsPerBlock;
int ct=0;
int myT;
for (int ci=0; ci<L; ci++) {
if (ct+ThreadsPerBlock<K) {
myT=ThreadsPerBlock;
} else {
myT=K-ct;
}
/* launch kernel with myT threads, starting at ct offset */
kernel_coherencies_slave<<<1,myT,sizeof(float)*(8*F*myT)>>>(sta1,sta2,tslot,B,N,T,K,myT,ct,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,&ll[ct],&mm[ct],&nn[ct],&sI[ct],&stype[ct],&sI0[ct],&f0[ct],&spec_idx[ct],&spec_idx1[ct],&spec_idx2[ct],&exs[ct],deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(myT));
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
if(error != cudaSuccess) {
// print the CUDA error message and exit
printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
}
#endif
ct=ct+ThreadsPerBlock;
}
//TODO: figure out if this max_f makes any sense
#define MAX_F 20
cuDoubleComplex l_coh[MAX_F];
for(int cf=0; cf<F; cf++) {
l_coh[cf] = make_cuDoubleComplex(0.0, 0.0);
}
//use simply for-loop, if K is very large this may be slow and may need further parallelization
for (int k=0; k<K; k++) {
//source specific params
double sIf,sI0f,spec_idxf,spec_idx1f,spec_idx2f,myf0;
double phterm0 = (2.0*M_PI*(u_n*__ldg(&ll[k])+v_n*__ldg(&mm[k])+w_n*__ldg(&nn[k])));
sIf=__ldg(&sI[k]);
if (F>1) {
sI0f=__ldg(&sI0[k]);
spec_idxf=__ldg(&spec_idx[k]);
spec_idx1f=__ldg(&spec_idx1[k]);
spec_idx2f=__ldg(&spec_idx2[k]);
myf0=__ldg(&f0[k]);
}
unsigned char stypeT=__ldg(&stype[k]);
for(int cf=0; cf<F; cf++) {
l_coh[cf] = cuCadd(l_coh[cf], compute_prodterm(sta1, sta2, N, K, T, F, phterm0, sIf, sI0f, spec_idxf, spec_idx1f, spec_idx2f,
myf0, freqs, deltaf, dobeam, tslot, k, cf, beam, exs, stypeT, u_n, v_n, w_n));
}
}
//write output with right multi frequency offset
double *coh1 = &coh[8*n];
for(int cf=0; cf<F; cf++) {
coh1[cf*8*B+0] = l_coh[cf].x;
coh1[cf*8*B+1] = l_coh[cf].y;
coh1[cf*8*B+2] = 0.0;
coh1[cf*8*B+3] = 0.0;
coh1[cf*8*B+4] = 0.0;
coh1[cf*8*B+5] = 0.0;
coh1[cf*8*B+6] = l_coh[cf].x;
coh1[cf*8*B+7] = l_coh[cf].y;
}
}
}
@ -722,7 +550,7 @@ checkCudaError(cudaError_t err, const char *file, int line)
*/
void
cudakernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude, float *latitude,
cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitude, float *latitude,
double *time_utc, int *Nelem, float **xx, float **yy, float **zz, float *ra, float *dec, float ph_ra0, float ph_dec0, float ph_freq0, float *beam) {
#ifdef CUDA_DBG
cudaError_t error;
@ -780,16 +608,11 @@ cudakernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude
dobeam: enable beam if >0
*/
void
cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, float *w,baseline_t *barr, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI,
unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *coh,int dobeam) {
cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI,
unsigned char *stype, double *sI0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) {
#ifdef CUDA_DBG
cudaError_t error;
error = cudaGetLastError();
error=cudaMemset(coh,0,sizeof(float)*8*B*F);
checkCudaError(error,__FILE__,__LINE__);
#endif
#ifndef CUDA_DBG
cudaMemset(coh,0,sizeof(float)*8*B*F);
#endif
/* spawn threads to handle baselines, these threads will spawn threads for sources */
@ -826,7 +649,7 @@ cudakernel_convert_time(int T, double *time_utc) {
int BlocksPerGrid=(T+ThreadsPerBlock-1)/ThreadsPerBlock;
kernel_convert_time<<<BlocksPerGrid,ThreadsPerBlock>>>(T,time_utc);
cudaDeviceSynchronize();
#ifdef CUDA_DBG
#ifdef CUDA_DBG
error = cudaGetLastError();
if(error != cudaSuccess) {
// print the CUDA error message and exit