diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8d2dcda
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,25 @@
+*/*.png
+*/*.out
+*/*.output
+*/*.solutions
+*/*.dot
+test/sm.ms/
+test/nvprof-resultaten/
+src/MS/sagecal
+test/analysis-*.txt
+test/extended_*.*
+
+*/*/*.a
+**/*/*.o
+*/*/*.swp
+*/*/*.swo
+*/*/*.out
+*/*/*.output
+
+*/*/*/*.o
+*/*/*/*.a
+*/*/*/*.swp
+*/*/*/*.swo
+*/*/*/*.out
+*/*/*/*.output
+
diff --git a/INSTALL b/INSTALL.md
similarity index 75%
rename from INSTALL
rename to INSTALL.md
index 41bfa99..6cba268 100644
--- a/INSTALL
+++ b/INSTALL.md
@@ -1,7 +1,8 @@
Fri 2 Dec 2016 23:07:19 CET
-SAGECal Installation
-====================
-0) Prerequsites:
+
+# SAGECal Installation
+
+## 1 Prerequisites:
- CASACORE http://casacore.googlecode.com/
- glib http://developer.gnome.org/glib
- BLAS/LAPACK
@@ -15,12 +16,13 @@ SAGECal Installation
-- Intel MKL and other libraries
- Get the source for SAGECal: git clone git://git.code.sf.net/p/sagecal/code sagecal-code
-1) The basic way to build is
+## 2 The basic way to build
1.a) go to ./src/lib and run make (which will create libsagecal.a)
1.b) go to ./src/MS and run make (which will create the executable)
-2) In ./src/lib and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are:
+## 3 Build settings
+In ./src/lib and ./src/MS you MUST edit the Makefiles to suit your system. Some common items to edit are:
- LAPACK: directory where LAPACK/OpenBLAS is installed
- GLIBI/GLIBL: include/lib files for glib
- CASA_LIBDIR/CASA_INCDIR/CASA_LIBS : casacore include/library location and files:
@@ -39,23 +41,23 @@ SAGECal Installation
-SAGECAL-MPI Installation
-========================
-0) Prerequsites:
+# SAGECAL-MPI Installation
+
+## 1 Prerequisites:
- Same as above
- MPI (e.g. OpenMPI)
-1) Build ./src/lib as above (using mpicc -DMPI_BUILD)
+## 2 Build ./src/lib as above (using mpicc -DMPI_BUILD)
-2) Build ./src/MPI using mpicc++
+## 3 Build ./src/MPI using mpic++
-BUILDSKY Installation
-=====================
-1) See INSTALL in ./src/buildsky
+## BUILDSKY Installation
+
+ - See INSTALL in ./src/buildsky
-RESTORE Installation
-=====================
-1) See INSTALL in ./src/restore
+## RESTORE Installation
+
+ - See INSTALL in ./src/restore
diff --git a/README b/README.md
similarity index 91%
rename from README
rename to README.md
index 1dbc075..9ba22a0 100644
--- a/README
+++ b/README.md
@@ -1,15 +1,17 @@
-SAGECAL
-=======
+# SAGECAL
+
+
Read INSTALL for installation. This file gives a brief guide to use SAGECal.
Warning: this file may be obsolete. Use sagecal -h to see up-to-date options.
-Step by Step Introduction:
-#######################################################################
+## Step by Step Introduction:
+
1a) Calibrate data in the standard way using BBS/CASA or anything else. Use NDPPP to average the data in your MS to a few channels (also average in time to about 10sec). Also flag any spikes in the data.
1b) For subtraction of the ATeam from raw data (CasA,CygA,...), no initial calibration is necessary. Just run sagecal on raw data, but it is better to scale the sky model to match the apparent flux of the sources that are being subtracted.
-#######################################################################
+
+
2) Sky Model:
3a) Make an image of your MS (using ExCon/casapy). Use Duchamp to create a mask for the image. Use buildsky to create a sky model (see the README file on the top level directory). Also create a proper cluster file.
@@ -39,15 +41,15 @@ e.g.
P1C1 0 12 42.996 85 43 21.514 0.030498 0 0 0 -5.713060 0 0 0 0 115039062.0
P5C1 1 18 5.864 85 58 39.755 0.041839 0 0 0 -6.672879 0 0 0 0 115039062.0
-# A Gaussian mjor,minor 0.1375,0.0917 deg diameter -> radius(rad), PA 43.4772 deg (-> rad)
-# Position Angle: "West from North (counter-clockwise)" (0 deg = North, 90 deg = West).
-# Note: PyBDSM and BBS use "North from East (counter-clockwise)" (0 deg = East, 90 deg = North).
+#A Gaussian major,minor 0.1375,0.0917 deg diameter -> radius(rad), PA 43.4772 deg (-> rad)
+#Position Angle: "West from North (counter-clockwise)" (0 deg = North, 90 deg = West).
+#Note: PyBDSM and BBS use "North from East (counter-clockwise)" (0 deg = East, 90 deg = North).
G0 5 34 31.75 22 00 52.86 100 0 0 0 0.00 0 0.0012 0.0008 -2.329615801 130.0e6
-# A Disk radius=0.041 deg
+#A Disk radius=0.041 deg
D01 23 23 25.67 58 48 58 80 0 0 0 0 0 0.000715 0.000715 0 130e6
-# A Ring radius=0.031 deg
+#A Ring radius=0.031 deg
R01 23 23 25.416 58 48 57 70 0 0 0 0 0 0.00052 0.00052 0 130e6
-# A shapelet ('S3C61MD.fits.modes' file must be in the current directory)
+#A shapelet ('S3C61MD.fits.modes' file must be in the current directory)
S3C61MD 2 22 49.796414 86 18 55.913266 0.135 0 0 0 -6.6 0 1 1 0.0 115000000.0
diff --git a/src/MS/Makefile b/src/MS/Makefile
index e2145cb..cd817ce 100644
--- a/src/MS/Makefile
+++ b/src/MS/Makefile
@@ -36,5 +36,4 @@ data.o:data.cpp data.h
sagecal:$(OBJECTS) ../lib/Radio/libsagecal.a ../lib/Dirac/libdirac.a
$(CXX) $(CXXFLAGS) $(LDFLAGS) $(INCLUDES) $(GLIBI) $(LIBPATH) -o $@ $(OBJECTS) $(MY_LIBS) $(LAPACK) $(CASA_LIBS) $(GLIBL)
clean:
- rm *.o
-
+ rm *.o *.tmp *.fits *.swp *.swo *.output
diff --git a/src/MS/Makefile.gpu b/src/MS/Makefile.gpu
index 0f3875b..9efaddd 100644
--- a/src/MS/Makefile.gpu
+++ b/src/MS/Makefile.gpu
@@ -10,9 +10,9 @@ LAPACK_DIR=/cm/shared/package/openblas/0.2.17mt/lib
#LAPACK_DIR=/usr/lib/atlas/sse/
CUDAINC=-I/cm/shared/apps/cuda80/toolkit/8.0.44/include/
-CUDALIB=-L/cm/shared/apps/cuda80/toolkit/8.0.44/lib64/ -lcuda -lcudart
+CUDALIB=-lcublas -lcusolver -lcudadevrt
-CULALIB=-lcublas -lcusolver -lcudadevrt
+# CULALIB=-lcublas -lcusolver -lcudadevrt
# NVML
NVML_INC=/usr/include/nvidia/gdk/
NVML_LIB=-lnvidia-ml -L/usr/lib64/nvidia/
diff --git a/src/MS/main.cpp b/src/MS/main.cpp
index 75566c8..829bdbf 100644
--- a/src/MS/main.cpp
+++ b/src/MS/main.cpp
@@ -23,6 +23,9 @@
#include
#include
#include
+#include
+
+#include "cuda_profiler_api.h"
#include
#include
@@ -241,11 +244,7 @@ main(int argc, char **argv) {
Data::readMSlist(Data::MSlist,&msnames);
}
if (Data::TableName) {
- if (!doBeam) {
- Data::readAuxData(Data::TableName,&iodata);
- } else {
Data::readAuxData(Data::TableName,&iodata,&beam);
- }
cout<<"Only one MS"<more()) {
start_time = time(0);
if (iodata.Nms==1) {
- if (!doBeam) {
- Data::loadData(msitr[0]->table(),iodata,&iodata.fratio);
- } else {
Data::loadData(msitr[0]->table(),iodata,beam,&iodata.fratio);
- }
} else {
Data::loadDataList(msitr,iodata,&iodata.fratio);
}
@@ -489,45 +469,16 @@ main(int argc, char **argv) {
preset_flags_and_data(iodata.Nbase*iodata.tilesz,iodata.flag,barr,iodata.x,Data::Nt);
/* if data is being whitened, whiten x here,
no need for a copy because we use xo for residual calculation */
- if (Data::whiten) {
- whiten_data(iodata.Nbase*iodata.tilesz,iodata.x,iodata.u,iodata.v,iodata.freq0,Data::Nt);
- }
/* precess source locations (also beam pointing) from J2000 to JAPP if we do any beam predictions,
using first time slot as epoch */
- if (doBeam && !sources_precessed) {
-
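/* Precession note: source positions (and the beam pointing) are given in
   J2000, but beam prediction needs apparent (JAPP) coordinates, so the
   guarded call below rotates them once, using the timestamp at the middle
   of the tile (beam.time_utc[iodata.tilesz/2]) as the epoch; the
   sources_precessed flag prevents the rotation from being applied again
   on later tiles. */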
precess_source_locations(beam.time_utc[iodata.tilesz/2],carr,M,&beam.p_ra0,&beam.p_dec0,Data::Nt); - sources_precessed=1; - } + // sources_precessed=1; -#ifdef USE_MIC - double *mic_u,*mic_v,*mic_w,*mic_x; - mic_u=iodata.u; - mic_v=iodata.v; - mic_w=iodata.w; - mic_x=iodata.x; - int mic_Nbase=iodata.Nbase; - int mic_tilesz=iodata.tilesz; - int mic_N=iodata.N; - double mic_freq0=iodata.freq0; - double mic_deltaf=iodata.deltaf; - double mic_data_min_uvcut=Data::min_uvcut; - int mic_data_Nt=Data::Nt; - int mic_data_max_emiter=Data::max_emiter; - int mic_data_max_iter=Data::max_iter; - int mic_data_max_lbfgs=Data::max_lbfgs; - int mic_data_lbfgs_m=Data::lbfgs_m; - int mic_data_gpu_threads=Data::gpu_threads; - int mic_data_linsolv=Data::linsolv; - int mic_data_solver_mode=Data::solver_mode; - int mic_data_randomize=Data::randomize; - double mic_data_nulow=Data::nulow; - double mic_data_nuhigh=Data::nuhigh; -#endif /* FIXME: uvmin is not needed in calibration, because its taken care of by flags */ if (!Data::DoSim) { /****************** calibration **************************/ +<<<<<<< HEAD #ifndef HAVE_CUDA if (!doBeam) { precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt); @@ -685,15 +636,19 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea #endif } +======= +#ifdef HAVE_CUDA + precalculate_coherencies_withbeam_gpu(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut, + beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt); +#endif +#ifndef HAVE_CUDA + precalculate_coherencies(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut,Data::Nt); +#endif +>>>>>>> master /****************** end calibration **************************/ /****************** begin diagnostics ************************/ -#ifdef HAVE_CUDA - if (Data::DoDiag) { - calculate_diagnostics(iodata.u,iodata.v,iodata.w,p,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,coh,M,Mt,Data::DoDiag,Data::Nt); - } -#endif /* HAVE_CUDA */ - /****************** end diagnostics **************************/ } else { +<<<<<<< HEAD /************ simulation only mode ***************************/ if (!solfile) { #ifndef HAVE_CUDA @@ -724,6 +679,23 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea } /************ end simulation only mode ***************************/ } +======= +#ifdef HAVE_CUDA + predict_visibilities_multifreq_withbeam_gpu(iodata.u,iodata.v,iodata.w,iodata.xo,iodata.N,iodata.Nbase,iodata.tilesz,barr,carr,M,iodata.freqs,iodata.Nchan,iodata.deltaf,iodata.deltat,iodata.dec0, + beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,beam.xx,beam.yy,beam.zz,doBeam,Data::Nt,(Data::DoSim>1?1:0)); +#endif +#ifndef HAVE_CUDA + precalculate_coherencies_withbeam(iodata.u,iodata.v,iodata.w,coh,iodata.N,iodata.Nbase*iodata.tilesz,barr,carr,M,iodata.freq0,iodata.deltaf,iodata.deltat,iodata.dec0,Data::min_uvcut,Data::max_uvcut, + beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,iodata.tilesz,beam.Nelem,beam.xx,beam.yy,beam.zz,Data::Nt); +#endif +} + +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); + 
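/* Profiling note: the cudaDeviceSynchronize() above drains every kernel
   and async copy still in flight, so the profiler-stop call below marks
   the true end of the profiled region. cudaProfilerStop() is declared in
   cuda_profiler_api.h, included at the top of this file, and pairs with a
   cudaProfilerStart() presumably issued earlier or with profiling started
   from the nvprof command line; the exit(0) that follows ends the run
   after the first tile, so exactly one tile's worth of work is captured. */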
cudaProfilerStop(); + exit(0); +#endif +>>>>>>> master tilex+=iodata.tilesz; /* print solutions to file */ @@ -740,33 +712,11 @@ beam.p_ra0,beam.p_dec0,iodata.freq0,beam.sx,beam.sy,beam.time_utc,beam.Nelem,bea } /**********************************************************/ - /* also write back */ - if (iodata.Nms==1) { - Data::writeData(msitr[0]->table(),iodata); - } else { - Data::writeDataList(msitr,iodata); - } for(int cm=0; cmres_ratio*res_prev) { - cout<<"Resetting Solution"<Nt); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==0) { } else if (solve_axb==1) { @@ -739,7 +739,7 @@ mlm_der_single( #endif exit(1); } - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==1) { /* QR solver ********************************/ /* workspace query */ @@ -1221,7 +1221,7 @@ oslevmar_der_single_nocuda( me_data_t *dt=(me_data_t*)adata; setweights(M,aones,1.0,dt->Nt); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==0) { } else if (solve_axb==1) { diff --git a/src/lib/Dirac/consensus_poly.c b/src/lib/Dirac/consensus_poly.c index 938f4d5..d45d492 100644 --- a/src/lib/Dirac/consensus_poly.c +++ b/src/lib/Dirac/consensus_poly.c @@ -283,185 +283,6 @@ find_prod_inverse(double *B, double *Bi, int Npoly, int Nf, double *fratio) { -typedef struct thread_data_prod_inv_ { - int startM; - int endM; - int M; - int Nf; - int Npoly; - double *B; - double *Bi; - double *rho; -} thread_data_prod_inv_t; - - -/* worker thread function for calculating sum and inverse */ -static void* -sum_inv_threadfn(void *data) { - thread_data_prod_inv_t *t=(thread_data_prod_inv_t*)data; - double w[1],*WORK,*U,*S,*VT; - - int k,ci,status,lwork=0; - int Np2=t->Npoly*t->Npoly; - /* allocate memory for the SVD here */ - if ((U=(double*)calloc((size_t)Np2,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((VT=(double*)calloc((size_t)Np2,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((S=(double*)calloc((size_t)t->Npoly,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - - /* memory for SVD: use first location of Bi */ - status=my_dgesvd('A','A',t->Npoly,t->Npoly,&(t->Bi[t->startM*Np2]),t->Npoly,S,U,t->Npoly,VT,t->Npoly,w,-1); - if (!status) { - lwork=(int)w[0]; - } else { - printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status); - exit(1); - } - if ((WORK=(double*)calloc((size_t)lwork,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - - - - /* iterate over clusters */ - for (k=t->startM; k<=t->endM; k++) { - memset(&(t->Bi[k*Np2]),0,sizeof(double)*Np2); - /* find sum */ - for (ci=0; ciNf; ci++) { - /* outer product */ - my_dgemm('N','T',t->Npoly,t->Npoly,1,t->rho[k+ci*t->M],&t->B[ci*t->Npoly],t->Npoly,&t->B[ci*t->Npoly],t->Npoly,1.0,&(t->Bi[k*Np2]),t->Npoly); - } - /* find SVD */ - status=my_dgesvd('A','A',t->Npoly,t->Npoly,&(t->Bi[k*Np2]),t->Npoly,S,U,t->Npoly,VT,t->Npoly,WORK,lwork); - if (status) { - printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status); - exit(1); - } - - /* find 1/singular values, and multiply columns of U with new singular values */ - for (ci=0; ciNpoly; ci++) { - if (S[ci]>CLM_EPSILON) { - S[ci]=1.0/S[ci]; - } else { - S[ci]=0.0; - } - my_dscal(t->Npoly,S[ci],&U[ci*t->Npoly]); - } - - /* find product U 1/S V^T */ - 
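/* Pseudoinverse note: each Bi block accumulated above is a weighted sum of
   outer products sum_f rho(k,f) * b_f b_f^T and is therefore symmetric
   positive semi-definite. For such a matrix the SVD A = U S V^T coincides
   with the eigendecomposition (V equals U on the nonzero singular
   subspace), so the Moore-Penrose pseudoinverse V S^+ U^T is obtained by
   the dgemm below as (U S^+) V^T: the columns of U were already scaled by
   the inverted singular values, and singular values below CLM_EPSILON were
   zeroed rather than inverted, which is what makes this a pseudoinverse
   and keeps ill-conditioned clusters from blowing up. */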
my_dgemm('N','N',t->Npoly,t->Npoly,t->Npoly,1.0,U,t->Npoly,VT,t->Npoly,0.0,&(t->Bi[k*Np2]),t->Npoly); - - } - - free(U); - free(VT); - free(S); - free(WORK); - return NULL; -} - -/* build matrix with polynomial terms - B : Npoly x Nf, each row is one basis function - Bi: Npoly x Npoly pseudo inverse of sum( B(:,col) x B(:,col)' ) : M times - Npoly : total basis functions - Nf: frequencies - M: clusters - rho: NfxM array of regularization factors (for each freq, M values) - Sum taken is a weighted sum, using weights in rho, rho is assumed to change for each freq,cluster pair - - Nt: no. of threads -*/ -int -find_prod_inverse_full(double *B, double *Bi, int Npoly, int Nf, int M, double *rho, int Nt) { - - pthread_attr_t attr; - pthread_t *th_array; - thread_data_prod_inv_t *threaddata; - - int ci,Nthb0,Nthb,nth,nth1; - /* clusters per thread */ - Nthb0=(M+Nt-1)/Nt; - - /* setup threads */ - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE); - - if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((threaddata=(thread_data_prod_inv_t*)malloc((size_t)Nt*sizeof(thread_data_prod_inv_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - - - ci=0; - for (nth=0; nth Q - Bi : NpolyxNpoly matrix = B^T - - for each direction (M values) - select 2N,2N,... : 2Nx Npoly complex values from z (ordered by M) - select real,imag: size 2NxNpoly, 2NxNpoly vectors - reshape to 2NxNpoly => R - reshape to 2NxNpoly => I (imag) - - then Q=([R I] Bi^T) for each column - Q=[R_1^T I_1^T R_2^T I_2^T]^T Bi^T for 2 columns - R_1,I_1,R_2,I_2 : size 2NxNpoly - R : (2N 4) x Npoly - so find Q - */ - double *R,*Q; - if ((R=(double*)calloc((size_t)2*t->N*t->Npoly*4,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((Q=(double*)calloc((size_t)2*t->N*t->Npoly*4,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - - int ci,np; - for (ci=t->startM; ci<=t->endM; ci++) { - for (np=0; npNpoly; np++) { - /* select 2N */ - my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M], 4, &R[np*8*t->N], 1); /* R_1 */ - my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+1], 4, &R[np*8*t->N+2*t->N], 1); /* I_1 */ - my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+2], 4, &R[np*8*t->N+2*2*t->N], 1); /* R_2 */ - my_dcopy(2*t->N, &t->z[8*t->N*ci+np*8*t->N*t->M+3], 4, &R[np*8*t->N+3*2*t->N], 1); /* I_2 */ - } - /* find Q=R B^T */ - memset(Q,0,sizeof(double)*2*t->N*t->Npoly*4); - my_dgemm('N','N',8*t->N,t->Npoly,t->Npoly,1.0,R,8*t->N,&t->Bi[ci*t->Npoly*t->Npoly],t->Npoly,1.0,Q,8*t->N); /* copy back to Z */ - for (np=0; npNpoly; np++) { - my_dcopy(2*t->N, &Q[np*8*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np], 4); - my_dcopy(2*t->N, &Q[np*8*t->N+2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+1], 4); - my_dcopy(2*t->N, &Q[np*8*t->N+2*2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+2], 4); - my_dcopy(2*t->N, &Q[np*8*t->N+3*2*t->N], 1, &t->Z[8*t->N*t->Npoly*ci+8*t->N*np+3], 4); + for (np=0; npoffset; - for (ci=t->startM; ci<=t->endM; ci++) { - ip12=my_ddot(8*t->N*t->carr[ci].nchunk,&t->deltaY[ck],&t->deltaJ[ck]); /* x^T y */ - /* further computations are only required if there is +ve correlation */ - if (ip12>CLM_EPSILON) { - /* find the inner products */ - ip11=my_dnrm2(8*t->N*t->carr[ci].nchunk,&t->deltaY[ck]); /* || ||_2 */ - ip22=my_dnrm2(8*t->N*t->carr[ci].nchunk,&t->deltaJ[ck]); /* || ||_2 */ - /* 
square the norm to get dot prod */ - ip11*=ip11; - ip22*=ip22; - /* only try to do an update if the 'delta's are finite, also - there is tangible correlation between the two deltas */ -#ifdef DEBUG - printf("%d ip11=%lf ip12=%lf ip22=%lf\n",ci,ip11,ip12,ip22); -#endif - if (ip11>CLM_EPSILON && ip22>CLM_EPSILON) { - double alphacorr=ip12/sqrt(ip11*ip22); - /* decide on whether to do further calculations only if there is sufficient correlation - between the deltas */ - if (alphacorr>alphacorrmin) { - double alphaSD=ip11/ip12; - double alphaMG=ip12/ip22; - double alphahat; - if (2.0*alphaMG>alphaSD) { - alphahat=alphaMG; - } else { - alphahat=alphaSD-alphaMG*0.5; - } -#ifdef DEBUG - printf("alphacorr=%lf alphaSD=%lf alphaMG=%lf alphahat=%lf rho=%lf\n",alphacorr,alphaSD,alphaMG,alphahat,t->rho[ci]); -#endif - /* decide on whether to update rho based on heuristics */ - if (alphahat> 0.001 && alphahatrhoupper[ci]) { -#ifdef DEBUG - printf("updating rho from %lf to %lf\n",t->rho[ci],alphahat); -#endif - t->rho[ci]=alphahat; - } - } - } - - } - ck+=t->N*8*t->carr[ci].nchunk; - } - return NULL; -} - - -/* Barzilai & Borwein update of rho [Xu et al] */ -/* rho: Mx1 values, to be updated - rhoupper: Mx1 values, upper limit of rho - N: no of stations - M : clusters - Mt: actual clusters (include hybrid parameter) Mt >= M - carr: cluster info array, to get hybrid parameters: Mx1 - Yhat: current Yhat : 8*N*Mt - Yhat_k0 : old Yhat at previous update of rho : 8*N*Mt - J: current solution : 8*N*Mt - J_k0: old solution at previous update of rho : 8*N*Mt - Nt: no. of threads -*/ -int -update_rho_bb(double *rho, double *rhoupper, int N, int M, int Mt, clus_source_t *carr, double *Yhat, double *Yhat_k0, double *J, double *J_k0, int Nt) { - - double *deltaY; /* Yhat - Yhat_k0 */ - double *deltaJ; /* J - J_k0 (with J_k0 projected to tangent plane of J)*/ - if ((deltaY=(double*)calloc((size_t)8*N*Mt,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((deltaJ=(double*)calloc((size_t)8*N*Mt,sizeof(double)))==0) { - printf("%s: %d: no free memory\n",__FILE__,__LINE__); - exit(1); - } - - - my_dcopy(8*N*Mt, Yhat, 1, deltaY, 1); - my_daxpy(8*N*Mt, Yhat_k0, -1.0, deltaY); -//no need to remove unitary ambiguity from J-Jold - my_dcopy(8*N*Mt, J, 1, deltaJ, 1); - my_daxpy(8*N*Mt, J_k0,-1.0, deltaJ); - - pthread_attr_t attr; - pthread_t *th_array; - thread_data_rho_bb_t *threaddata; - - int ci,cj,ck,Nthb0,Nthb,nth,nth1; - /* clusters per thread */ - Nthb0=(M+Nt-1)/Nt; - - /* setup threads */ - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE); - - if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - if ((threaddata=(thread_data_rho_bb_t*)malloc((size_t)Nt*sizeof(thread_data_rho_bb_t)))==0) { - fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__); - exit(1); - } - - - ci=0; - ck=0; - for (nth=0; nth - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
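/* Aside on the Barzilai-Borwein code removed above: rho_bb_threadfn and
   update_rho_bb implement the spectral (BB) step-size rule of Xu et al.
   for the per-cluster ADMM penalty rho. With dY = Yhat - Yhat_k0 and
   dJ = J - J_k0, the worker forms ip11 = <dY,dY>, ip12 = <dY,dJ> and
   ip22 = <dJ,dJ>, giving the steepest-descent step alphaSD = ip11/ip12
   and the minimum-gradient step alphaMG = ip12/ip22; the hybrid rule takes
   alphaMG when 2*alphaMG > alphaSD and alphaSD - alphaMG/2 otherwise, and
   rho[ci] is replaced by the result only when the correlation
   ip12/sqrt(ip11*ip22) is high enough and the new value stays below
   rhoupper[ci]. */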
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - $Id$ - */ - - -#include -#include -#include -#include -#include -#include -#include - -#include "Dirac.h" - -int -open_data_stream(int file, double **d, int *count, int *N, double *freq0, double *ra0, double *dec0) { - struct stat statbuf; - - int ig; - - - /* find the file size */ - if (fstat (file,&statbuf) < 0) { - fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__); - exit(1); - } - - //printf("file size (bytes) %d\n",(int)statbuf.st_size); - /* total double values is size/8 */ - *count=statbuf.st_size/8; - //printf("total double values %d\n",*count); - - /* map the file to memory */ - *d= (double*)mmap(NULL, statbuf.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, file, 0); - if ( !d) { - fprintf(stderr,"%s: %d: no file open\n",__FILE__,__LINE__); - exit(1); - } - - /* remove header from data */ - *N=(int)(*d)[0]; - *freq0=(*d)[1]; - *ra0=(*d)[2]; - *dec0=(*d)[3]; - /* read ignored stations and discard them */ - ig=(int)(*d)[4]; - /* make correct value for N */ - *N=*N-ig; - - printf("Ignoring %d stations\n",ig); - /* increment to data */ - *d=&((*d)[5+ig]); - - return(0); -} - - -int -close_data_stream(double *d, int count) { - - /* sync to disk */ - msync(d, (size_t)count*sizeof(double), MS_SYNC ); - munmap((void*)d, (size_t)count*sizeof(double)); - return 0; -} - diff --git a/src/lib/Dirac/lbfgs.c b/src/lib/Dirac/lbfgs.c index 221e6a7..776101d 100644 --- a/src/lib/Dirac/lbfgs.c +++ b/src/lib/Dirac/lbfgs.c @@ -102,7 +102,7 @@ pipeline_slave_code_b(void *data) checkCudaError(err,__FILE__,__LINE__); } else if (dp->status[tid]==PT_DO_CCOST) { /* divide total baselines by 2 */ - int BlocksPerGrid=(dp->Nbase[tid]+dp->lmdata[tid]->ThreadsPerBlock-1)/dp->lmdata[tid]->ThreadsPerBlock; + int BlocksPerGrid= 2*(dp->Nbase[tid]+dp->lmdata[tid]->ThreadsPerBlock-1)/dp->lmdata[tid]->ThreadsPerBlock; int boff=dp->boff[tid]; /* copy the current solution to device */ err=cudaMemcpy(dp->cpp[tid], dp->lmdata[tid]->p, m*sizeof(double), cudaMemcpyHostToDevice); @@ -901,12 +901,12 @@ lbfgs_fit_common( /* parameters per thread (GPU) */ int Nparm=(m+2-1)/2; /* find number of blocks */ - int BlocksPerGrid = (Nparm+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid = 2* (Nparm+ThreadsPerBlock-1)/ThreadsPerBlock; ci=0; int nth; for (nth=0; nth<2; nth++) { threaddata[nth].ThreadsPerBlock=ThreadsPerBlock; - threaddata[nth].BlocksPerGrid=BlocksPerGrid; + threaddata[nth].BlocksPerGrid= 2*BlocksPerGrid; threaddata[nth].card=nth; threaddata[nth].Nbase=dp->Nbase; threaddata[nth].tilesz=dp->tilesz; diff --git a/src/lib/Dirac/myblas.c b/src/lib/Dirac/myblas.c index cbf1d75..35d237a 100644 --- a/src/lib/Dirac/myblas.c +++ b/src/lib/Dirac/myblas.c @@ -206,7 +206,7 @@ __attribute__ ((target(MIC))) } -/* following routines used in LAPACK solvers */ +/* following routines used in LAPACK dirac */ /* cholesky factorization: real symmetric */ int my_dpotrf(char uplo, int N, double *A, int lda) { diff --git a/src/lib/Dirac/oslmfit.c b/src/lib/Dirac/oslmfit.c index 82be194..f19f447 100644 --- a/src/lib/Dirac/oslmfit.c +++ b/src/lib/Dirac/oslmfit.c @@ -157,7 +157,7 @@ oslevmar_der_single_cuda( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; unsigned long int moff; 
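This commit repeatedly doubles BlocksPerGrid from the usual ceil(M/ThreadsPerBlock). Launching more blocks than strictly needed is harmless only because the kernels involved bounds-check their global thread index (the extra threads fail the n < M style guards and idle); the more conventional way to decouple grid size from problem size is a grid-stride loop. A minimal sketch of both, assuming a simple element-wise kernel (the kernel and variable names are illustrative, not from this codebase):

#include <cstdio>
#include <cuda_runtime.h>

/* Illustrative kernel: any grid size works because each thread strides
 * over the whole index range instead of assuming one element per thread. */
__global__ void scale_kernel(int M, double a, double *x) {
  for (unsigned int n = threadIdx.x + blockDim.x * blockIdx.x;
       n < (unsigned int)M;
       n += blockDim.x * gridDim.x) {
    x[n] *= a;
  }
}

int main(void) {
  const int M = 1000, ThreadsPerBlock = 128;
  /* ceil(M/ThreadsPerBlock); the extra factor 2 used in this commit only
   * adds blocks whose threads fail the n < M test above. */
  int BlocksPerGrid = 2 * (M + ThreadsPerBlock - 1) / ThreadsPerBlock;
  double *xd;
  cudaMalloc((void **)&xd, M * sizeof(double));
  cudaMemset(xd, 0, M * sizeof(double));
  scale_kernel<<<BlocksPerGrid, ThreadsPerBlock>>>(M, 2.0, xd);
  cudaDeviceSynchronize();
  printf("last CUDA error: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaFree(xd);
  return 0;
}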
@@ -190,7 +190,7 @@ oslevmar_der_single_cuda( checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**)&ed, N*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==1) { err=cudaMalloc((void**)&taud, M*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); diff --git a/src/lib/Dirac/robustlm.c b/src/lib/Dirac/robustlm.c index 07b0f6e..05e21d8 100644 --- a/src/lib/Dirac/robustlm.c +++ b/src/lib/Dirac/robustlm.c @@ -159,7 +159,7 @@ rlevmar_der_single_cuda( int ThreadsPerBlock=DEFAULT_TH_PER_BK; int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* DEFAULT_TH_PER_BK/8 for accessing each element of a baseline */ int ThreadsPerBlock2=Nd/2; /* for evaluating nu */ - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; unsigned long int moff; @@ -196,7 +196,7 @@ rlevmar_der_single_cuda( checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**)&ed, N*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==1) { err=cudaMalloc((void**)&taud, M*sizeof(double)); checkCudaError(err,__FILE__,__LINE__); @@ -805,7 +805,7 @@ rlevmar_der_single_cuda_fl( /* FIXME: might need a large value for large no of baselines */ int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* for accessing each element of a baseline */ int ThreadsPerBlock2=Nd/2; /* for evaluating nu */ - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; unsigned long int moff; @@ -1382,7 +1382,7 @@ osrlevmar_der_single_cuda_fl( /* FIXME: might need a large value for large no of baselines */ int ThreadsPerBlock1=DEFAULT_TH_PER_BK; /* for accessing each element of a baseline */ int ThreadsPerBlock2=Nd/2; /* for evaluating nu */ - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; unsigned long int moff; @@ -2180,7 +2180,7 @@ rlevmar_der_single_nocuda( setweights(M,aones,1.0,lmdata->Nt); /*W set initial weights to 1 */ setweights(N,wtd,1.0,lmdata->Nt); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==0) { } else if (solve_axb==1) { @@ -2766,7 +2766,7 @@ osrlevmar_der_single_nocuda( /*W set initial weights to 1 */ setweights(N,wtd,1.0,lmdata0->Nt); - /* memory allocation: different solvers */ + /* memory allocation: different dirac */ if (solve_axb==0) { } else if (solve_axb==1) { diff --git a/src/lib/Dirac/rtr_solve_cuda.c b/src/lib/Dirac/rtr_solve_cuda.c index 2d9b912..ed4ae25 100644 --- a/src/lib/Dirac/rtr_solve_cuda.c +++ b/src/lib/Dirac/rtr_solve_cuda.c @@ -582,7 +582,7 @@ rtr_solve_cuda_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x to make J: 2Nx2 complex double diff --git a/src/lib/Dirac/rtr_solve_robust_cuda.c b/src/lib/Dirac/rtr_solve_robust_cuda.c index 45d62f1..163bf29 100644 --- a/src/lib/Dirac/rtr_solve_robust_cuda.c +++ b/src/lib/Dirac/rtr_solve_robust_cuda.c @@ -602,7 +602,7 @@ rtr_solve_cuda_robust_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x 
to make J: 2Nx2 complex double @@ -985,7 +985,7 @@ nsd_solve_cuda_robust_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x to make J: 2Nx2 complex double diff --git a/src/lib/Dirac/rtr_solve_robust_cuda_admm.c b/src/lib/Dirac/rtr_solve_robust_cuda_admm.c index 7451c6b..d77c735 100644 --- a/src/lib/Dirac/rtr_solve_robust_cuda_admm.c +++ b/src/lib/Dirac/rtr_solve_robust_cuda_admm.c @@ -517,7 +517,7 @@ rtr_solve_cuda_robust_admm_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x to make J: 2Nx2 complex double @@ -947,7 +947,7 @@ nsd_solve_cuda_robust_admm_fl( /* calculate no of cuda threads and blocks */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int BlocksPerGrid=(M+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock; /* reshape x to make J: 2Nx2 complex double diff --git a/src/lib/Radio/Makefile b/src/lib/Radio/Makefile index 43bb3be..49bcd1c 100644 --- a/src/lib/Radio/Makefile +++ b/src/lib/Radio/Makefile @@ -2,11 +2,12 @@ CC=gcc CXX=g++ #CFLAGS= -Wall -O3 -g #-pg CFLAGS= -Wall -O3 -fopt-info-optimized +# CFLAGS= -Wall -pg -O2 -ansi -fPIC -fpermissive -fno-omit-frame-pointer -DNDEBUG -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls CLIBS= -lm -lpthread #LAPACK=-L/usr/lib/atlas/sse -llapack -lblas #LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran -LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread - +#LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread +LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread INCLUDES= -I. 
-I../Dirac/ LIBPATH= diff --git a/src/lib/Radio/Makefile.gpu b/src/lib/Radio/Makefile.gpu index d9e2d7c..eaead10 100644 --- a/src/lib/Radio/Makefile.gpu +++ b/src/lib/Radio/Makefile.gpu @@ -3,7 +3,8 @@ CXX=g++ NVCC=nvcc CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE -pg CLIBS= -lm -lpthread -LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread +#LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread +LAPACK=-L/cm/shared/package/openblas/0.2.17mt/lib -lopenblas -lgfortran -lpthread # LAPACK=-lblas -lgfortran -lpthread CUDAINC=/usr/local/cuda/include diff --git a/src/lib/Radio/predict_model.cu b/src/lib/Radio/predict_model.cu index 64b0a44..6678c34 100644 --- a/src/lib/Radio/predict_model.cu +++ b/src/lib/Radio/predict_model.cu @@ -26,25 +26,6 @@ /* enable this for checking for kernel failure */ //#define CUDA_DBG -/* matrix multiplications */ -/* C=A*B */ -__device__ void -amb(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) { - c[0]=cuCadd(cuCmul(a[0],b[0]),cuCmul(a[1],b[2])); - c[1]=cuCadd(cuCmul(a[0],b[1]),cuCmul(a[1],b[3])); - c[2]=cuCadd(cuCmul(a[2],b[0]),cuCmul(a[3],b[2])); - c[3]=cuCadd(cuCmul(a[2],b[1]),cuCmul(a[3],b[3])); -} -/* C=A*B^H */ -__device__ void -ambt(const cuDoubleComplex *__restrict__ a, const cuDoubleComplex *__restrict__ b, cuDoubleComplex *__restrict__ c) { - c[0]=cuCadd(cuCmul(a[0],cuConj(b[0])),cuCmul(a[1],cuConj(b[1]))); - c[1]=cuCadd(cuCmul(a[0],cuConj(b[2])),cuCmul(a[1],cuConj(b[3]))); - c[2]=cuCadd(cuCmul(a[2],cuConj(b[0])),cuCmul(a[3],cuConj(b[1]))); - c[3]=cuCadd(cuCmul(a[2],cuConj(b[2])),cuCmul(a[3],cuConj(b[3]))); -} - - __device__ void radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float thetaGMST, float *az, float *el) { float thetaLST=thetaGMST+longitude*180.0f/M_PI; @@ -70,49 +51,134 @@ radec2azel_gmst__(float ra, float dec, float longitude, float latitude, float th *az=azd; } -/* use compiler directives to use/not use shared memory */ -/* ARRAY_MAX_ELEM : define this in header file beforehand, if using shared memory */ -/* master kernel to calculate beam */ +/* slave kernel to calculate phase of manifold vector for given station */ +/* x,y,z: Nx1 arrays of element coords */ +/* sum: scalar to store result */ +/* NOTE: only 1 block should be used here */ __global__ void -kernel_array_beam(int N, int T, int K, int F, - const double *__restrict__ freqs, const float *__restrict__ longitude, const float *__restrict__ latitude, - const double *__restrict__ time_utc, const int *__restrict__ Nelem, - const float * const *__restrict__ xx, const float * const *__restrict__ yy, const float * const *__restrict__ zz, - const float *__restrict__ ra, const float *__restrict__ dec, - float ph_ra0, float ph_dec0, float ph_freq0, float *beam) { +kernel_array_beam_slave_sin(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) { + unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; + extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */ + if (n 1) { + int halfPoint = (nTotalThreads >> 1); // divide by two + if (n < halfPoint) { + int thread2 = n + halfPoint; + if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... 
blockDim_2-1 ) + tmpsum[n] = tmpsum[n]+tmpsum[thread2]; + } + } + __syncthreads(); + nTotalThreads = halfPoint; // Reducing the binary tree size by two + } - // find respective source,freq,time for this thread - int n1 = x; - int isrc=n1/(T*F); - n1=n1-isrc*(T*F); - int ifrq=n1/(T); - n1=n1-ifrq*(T); - int itm=n1; + /* now thread 0 will add up results */ + if (threadIdx.x==0) { + *sum=tmpsum[0]; + } +} - //number of elements for this station - int Nelems = __ldg(&Nelem[istat]); +__global__ void +kernel_array_beam_slave_cos(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) { + unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; + extern __shared__ float tmpsum[]; /* assumed to be size Nx1 */ + if (n 1) { + int halfPoint = (nTotalThreads >> 1); // divide by two + if (n < halfPoint) { + int thread2 = n + halfPoint; + if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 ) + tmpsum[n] = tmpsum[n]+tmpsum[thread2]; + } + } + __syncthreads(); + nTotalThreads = halfPoint; // Reducing the binary tree size by two + } - //using shared memory - #if (ARRAY_USE_SHMEM==1) - __shared__ float sh_x[ARRAY_MAX_ELEM]; - __shared__ float sh_y[ARRAY_MAX_ELEM]; - __shared__ float sh_z[ARRAY_MAX_ELEM]; - for (int i=threadIdx.x; i 1) { + int halfPoint = (nTotalThreads >> 1); // divide by two + if (n < halfPoint) { + int thread2 = n + halfPoint; + if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 ) + tmpsum[2*n] = tmpsum[2*n]+tmpsum[2*thread2]; + tmpsum[2*n+1] = tmpsum[2*n+1]+tmpsum[2*thread2+1]; + } + } + __syncthreads(); + nTotalThreads = halfPoint; // Reducing the binary tree size by two + } + + /* now thread 0 will add up results */ + if (threadIdx.x==0) { + sum[0]=tmpsum[0]; + sum[1]=tmpsum[1]; + } +} + + +__device__ int +NearestPowerOf2 (int n){ + if (!n) return n; //(0 == 2^0) + + int x = 1; + while(x < n) { + x <<= 1; + } + return x; +} + + +/* master kernel to calculate beam */ +/* tarr: size NTKFx2 buffer to store sin() cos() sums */ +__global__ void +kernel_array_beam(int N, int T, int K, int F, float *freqs, float *longitude, float *latitude, + double *time_utc, int *Nelem, float **xx, float **yy, float **zz, float *ra, float *dec, float ph_ra0, float ph_dec0, float ph_freq0, float *beam, float *tarr) { + + /* global thread index */ + unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; + /* allowed max threads */ + int Ntotal=N*T*K*F; + if (n>>(Nelems,r1,r2,r3,xx[istat],yy[istat],zz[istat],&tarr[2*n],NearestPowerOf2(Nelems)); + cudaDeviceSynchronize(); +#ifdef CUDA_DBG + error = cudaGetLastError(); + if(error != cudaSuccess) { + // print the CUDA error message and exit + printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); + } +#endif + float ssum=__ldg(&tarr[2*n]); + float csum=__ldg(&tarr[2*n+1]); + float Nnor=1.0f/(float)Nelems; ssum*=Nnor; csum*=Nnor; /* store output (amplitude of beam)*/ - int boffset=itm*N*K*F+isrc*N*F+ifrq*N+istat; beam[boffset]=sqrtf(ssum*ssum+csum*csum); //printf("thread %d stat %d src %d freq %d time %d : %lf longitude=%lf latitude=%lf time=%lf freq=%lf elem=%d ra=%lf dec=%lf beam=%lf\n",n,istat,isrc,ifrq,itm,time_utc[itm],longitude[istat],latitude[istat],time_utc[itm],freqs[ifrq],Nelem[istat],ra[isrc],dec[isrc],beam[boffset]); } @@ -175,8 +247,8 @@ kernel_array_beam(int N, int T, int K, int F, } /***************************************************************************/ -__device__ cuDoubleComplex -gaussian_contrib__(int 
*dd, float u, float v, float w) { +__device__ cuFloatComplex +gaussian_contrib(int *dd, float u, float v, float w) { exinfo_gaussian *dp=(exinfo_gaussian*)dd; float up,vp,a,b,ut,vt,cosph,sinph; @@ -196,13 +268,13 @@ gaussian_contrib__(int *dd, float u, float v, float w) { ut=a*(cosph*up-sinph*vp); vt=b*(sinph*up+cosph*vp); - return make_cuDoubleComplex((double)(0.5f*M_PI*expf(-(ut*ut+vt*vt))),0.0); + return make_cuFloatComplex(0.5f*M_PI*expf(-(ut*ut+vt*vt)),0.0f); } -__device__ cuDoubleComplex -ring_contrib__(int *dd, float u, float v, float w) { +__device__ cuFloatComplex +ring_contrib(int *dd, float u, float v, float w) { exinfo_ring *dp=(exinfo_ring*)dd; float up,vp,a,b; @@ -213,11 +285,11 @@ ring_contrib__(int *dd, float u, float v, float w) { a=dp->eX; /* diameter */ b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI; - return make_cuDoubleComplex((double)j0f(b),0.0); + return make_cuFloatComplex(j0f(b),0.0f); } -__device__ cuDoubleComplex -disk_contrib__(int *dd, float u, float v, float w) { +__device__ cuFloatComplex +disk_contrib(int *dd, float u, float v, float w) { exinfo_disk *dp=(exinfo_disk*)dd; float up,vp,a,b; @@ -228,7 +300,7 @@ disk_contrib__(int *dd, float u, float v, float w) { a=dp->eX; /* diameter */ b=sqrtf(up*up+vp*vp)*a*2.0f*M_PI; - return make_cuDoubleComplex((double)j1f(b),0.0); + return make_cuFloatComplex(j1f(b),0.0f); } @@ -321,8 +393,8 @@ calculate_uv_mode_vectors_scalar(float u, float v, float beta, int n0, float *Av free(shpvl); } -__device__ cuDoubleComplex -shapelet_contrib__(int *dd, float u, float v, float w) { +__device__ cuFloatComplex +shapelet_contrib(int *dd, float u, float v, float w) { exinfo_shapelet *dp=(exinfo_shapelet*)dd; int *cplx; float *Av; @@ -346,11 +418,6 @@ shapelet_contrib__(int *dd, float u, float v, float w) { __sincosf((float)dp->eP,&sinph,&cosph); ut=a*(cosph*up-sinph*vp); vt=b*(sinph*up+cosph*vp); - /* if u,v is way off the scale (beta) of shapelet modes, the result is almost always zero, - so check this here and return 0, otherwise spurious nans may result */ - if (__fdiv_rz(100.0f,__fsqrt_rz(ut*ut+vt*vt))beta) { - return make_cuDoubleComplex(0.0,0.0); - } /* note: we decompose f(-l,m) so the Fourier transform is F(-u,v) so negate the u grid */ Av=(float*)malloc((size_t)((dp->n0)*(dp->n0))*sizeof(float)); @@ -369,198 +436,163 @@ shapelet_contrib__(int *dd, float u, float v, float w) { free(Av); free(cplx); + //return 2.0*M_PI*(realsum+_Complex_I*imagsum); realsum*=2.0f*M_PI*a*b; imagsum*=2.0f*M_PI*a*b; - /* additional safeguards */ - if ( isnan(realsum) ) { realsum=0.0f; } - if ( isnan(imagsum) ) { imagsum=0.0f; } - return make_cuDoubleComplex((double)realsum,(double)imagsum); + return make_cuFloatComplex(realsum,imagsum); } -__device__ void -compute_prodterm_multifreq(int sta1, int sta2, int N, int K, int T, int F, -double phterm0, double sI0f, double sQ0f, double sU0f, double sV0f, double spec_idxf, double spec_idx1f, double spec_idx2f, double myf0, - double myfreq, double deltaf, int dobeam, int itm, int k, int cf, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) { - /* F>1 is assumed output: 8x1 array */ - double sinph,cosph; - sincos(phterm0*myfreq,&sinph,&cosph); - cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph); - double If,Qf,Uf,Vf; - If=Qf=Uf=Vf=0.0; - /* evaluate spectra */ - double fratio=log(myfreq/myf0); - double fratio1=fratio*fratio; - double fratio2=fratio1*fratio; - double cm=spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2; - /* catch -ve 
flux */ - if (sI0f>0.0) { - If=exp(log(sI0f)+cm); - } else if (sI0f<0.0) { - If=-exp(log(-sI0f)+cm); - } - if (sQ0f>0.0) { - Qf=exp(log(sQ0f)+cm); - } else if (sI0f<0.0) { - Qf=-exp(log(-sQ0f)+cm); - } - if (sU0f>0.0) { - Uf=exp(log(sU0f)+cm); - } else if (sI0f<0.0) { - Uf=-exp(log(-sU0f)+cm); - } - if (sV0f>0.0) { - Vf=exp(log(sV0f)+cm); - } else if (sI0f<0.0) { - Vf=-exp(log(-sV0f)+cm); - } - /* smearing, beam */ - double scalef=1.0; - double phterm =(phterm0*0.5*deltaf); - if (phterm!=0.0) { - sinph=(sin(phterm)/phterm); - scalef *=fabs(sinph); /* catch -ve values due to rounding off */ + +/* slave thread to calculate coherencies, for 1 source */ +/* baseline (sta1,sta2) at time itm */ +/* K: total sources, uset to find right offset + Kused: actual sources calculated in this thread block + Koff: offset in source array to start calculation + NOTE: only 1 block is used + */ +__global__ void +kernel_coherencies_slave(int sta1, int sta2, int itm, int B, int N, int T, int K, int Kused, int Koff, int F, float u, float v, float w, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI, + unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *__restrict__ coh,int dobeam,int blockDim_2) { + /* which source we work on */ + unsigned int k=threadIdx.x+blockDim.x*blockIdx.x; + + extern __shared__ float tmpcoh[]; /* assumed to be size 8*F*Kusedx1 */ + + if (k1) { + sI0f=__ldg(&sI0[k]); + spec_idxf=__ldg(&spec_idx[k]); + spec_idx1f=__ldg(&spec_idx1[k]); + spec_idx2f=__ldg(&spec_idx2[k]); + myf0=__ldg(&f0[k]); + } + unsigned char stypeT=__ldg(&stype[k]); + for(int cf=0; cf0.0f) { + If=__expf(__logf(sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); + } else if (sI0f<0.0f) { + If=-__expf(__logf(-sI0f)+spec_idxf*fratio+spec_idx1f*fratio1+spec_idx2f*fratio2); + } else { + If=0.0f; + } + } + /* smearing */ + float phterm =phterm0*0.5f*deltaf; + if (phterm!=0.0f) { + sinph=__sinf(phterm)/phterm; + If *=fabsf(sinph); /* catch -ve values due to rounding off */ } if (dobeam) { /* get beam info */ //int boffset1=sta1*K*T*F + k1*T*F + cf*T + itm; - - int boffset1=itm*N*K*F+k*N*F+cf*N+sta1; - // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2); + int boffset1=itm*N*K*F+k1*N*F+cf*N+sta1; float beam1=__ldg(&beam[boffset1]); //int boffset2=sta2*K*T*F + k1*T*F + cf*T + itm; - int boffset2=itm*N*K*F+k*N*F+cf*N+sta2; + int boffset2=itm*N*K*F+k1*N*F+cf*N+sta2; float beam2=__ldg(&beam[boffset2]); - scalef *=(double)(beam1*beam2); + If *=beam1*beam2; } - /* form complex value */ - prodterm.x *=scalef; - prodterm.y *=scalef; + prodterm.x *=If; + prodterm.y *=If; /* check for type of source */ if (stypeT!=STYPE_POINT) { - double uscaled=u*myfreq; - double vscaled=v*myfreq; - double wscaled=w*myfreq; + float uscaled=u*myfreq; + float vscaled=v*myfreq; + float wscaled=w*myfreq; if (stypeT==STYPE_SHAPELET) { - prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmulf(shapelet_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); } else if (stypeT==STYPE_GAUSSIAN) { - prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmulf(gaussian_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); } else if (stypeT==STYPE_DISK) { - prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmulf(disk_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); } 
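/* Source-shape note: each extended type multiplies the point-source phase
   term by the analytic Fourier transform of its brightness profile,
   evaluated at the frequency-scaled (u,v,w): a mode sum for shapelets, an
   elliptic exponential for Gaussians, J1 for a uniform disk, and (in the
   branch below) J0 for a ring. */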
else if (stypeT==STYPE_RING) { - prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + prodterm=cuCmulf(ring_contrib(exs[k],uscaled,vscaled,wscaled),prodterm); } } + +//printf("k=%d cf=%d freq=%f uvw %f,%f,%f lmn %f,%f,%f phterm %f If %f\n",k,cf,freqs[cf],u,v,w,ll[k],mm[k],nn[k],phterm,If); - double Ix,Iy,Qx,Qy,Ux,Uy,Vx,Vy; - Ix=If*prodterm.x; - Iy=If*prodterm.y; - Qx=Qf*prodterm.x; - Qy=Qf*prodterm.y; - Ux=Uf*prodterm.x; - Uy=Uf*prodterm.y; - Vx=Vf*prodterm.x; - Vy=Vf*prodterm.y; + /* write output to shared array */ + tmpcoh[k*8*F+8*cf]=prodterm.x; + tmpcoh[k*8*F+8*cf+1]=prodterm.y; + tmpcoh[k*8*F+8*cf+2]=0.0f; + tmpcoh[k*8*F+8*cf+3]=0.0f; + tmpcoh[k*8*F+8*cf+4]=0.0f; + tmpcoh[k*8*F+8*cf+5]=0.0f; + tmpcoh[k*8*F+8*cf+6]=prodterm.x; + tmpcoh[k*8*F+8*cf+7]=prodterm.y; + } + } + __syncthreads(); - - - output[0]=Ix+Qx; - output[1]=Iy+Qy; - output[2]=Ux-Vy; - output[3]=Vx+Uy; - output[4]=Ux+Vy; - output[5]=-Vx+Uy; - output[6]=Ix-Qx; - output[7]=Iy-Qy; - -} - - -__device__ void -compute_prodterm(int sta1, int sta2, int N, int K, int T, - double phterm0, double If, double Qf, double Uf, double Vf, - double myfreq, double deltaf, int dobeam, int itm, int k, const float *__restrict__ beam, int **exs, unsigned char stypeT, double u, double v, double w, double *__restrict__ output) { - /* F==1 is assumed, output: 8x1 array */ - double sinph,cosph; - sincos(phterm0*myfreq,&sinph,&cosph); - cuDoubleComplex prodterm=make_cuDoubleComplex(cosph,sinph); - - /* smearing, beam */ - double scalef=1.0; - double phterm =(phterm0*0.5*deltaf); - if (phterm!=0.0) { - sinph=(sin(phterm)/phterm); - scalef *=fabs(sinph); /* catch -ve values due to rounding off */ - } - - if (dobeam) { - /* get beam info */ - int boffset1=itm*N*K+k*N+sta1; - // printf("itm=%d, k1=%d, sta1=%d, sta2=%d, boffset1=%d, boffset2=%d\n", itm, k1, sta1, sta2, boffset1, boffset2); - float beam1=__ldg(&beam[boffset1]); - int boffset2=itm*N*K+k*N+sta2; - float beam2=__ldg(&beam[boffset2]); - scalef *=(double)(beam1*beam2); - } - - - /* form complex value */ - prodterm.x *=scalef; - prodterm.y *=scalef; - - /* check for type of source */ - if (stypeT!=STYPE_POINT) { - double uscaled=u*myfreq; - double vscaled=v*myfreq; - double wscaled=w*myfreq; - if (stypeT==STYPE_SHAPELET) { - prodterm=cuCmul(shapelet_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); - } else if (stypeT==STYPE_GAUSSIAN) { - prodterm=cuCmul(gaussian_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); - } else if (stypeT==STYPE_DISK) { - prodterm=cuCmul(disk_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); - } else if (stypeT==STYPE_RING) { - prodterm=cuCmul(ring_contrib__(exs[k],uscaled,vscaled,wscaled),prodterm); + // Build summation tree over elements, handling case where total threads is not a power of two. + int nTotalThreads = blockDim_2; // Total number of threads (==Kused), rounded up to the next power of two + while(nTotalThreads > 1) { + int halfPoint = (nTotalThreads >> 1); // divide by two + if (k < halfPoint) { + int thread2 = k + halfPoint; + if (thread2 < blockDim.x) { // Skipping the fictitious threads >Kused ( blockDim.x ... 
blockDim_2-1 ) + for(int cf=0; cf1 - if (F==1) { - //use simply for-loop, if K is very large this may be slow and may need further parallelization - for (int k=0; k>>(sta1,sta2,tslot,B,N,T,K,K,0,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,ll,mm,nn,sI,stype,sI0,f0,spec_idx,spec_idx1,spec_idx2,exs,deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(K)); + cudaDeviceSynchronize(); +#ifdef CUDA_DBG + error = cudaGetLastError(); + if(error != cudaSuccess) { + // print the CUDA error message and exit + printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); + } +#endif } else { - //use simply for-loop, if K is very large this may be slow and may need further parallelization - for (int k=0; k>>(sta1,sta2,tslot,B,N,T,K,myT,ct,F,__ldg(&u[n]),__ldg(&v[n]),__ldg(&w[n]),freqs,beam,&ll[ct],&mm[ct],&nn[ct],&sI[ct],&stype[ct],&sI0[ct],&f0[ct],&spec_idx[ct],&spec_idx1[ct],&spec_idx2[ct],&exs[ct],deltaf,deltat,dec0,&coh[8*n],dobeam,NearestPowerOf2(myT)); + cudaDeviceSynchronize(); +#ifdef CUDA_DBG + error = cudaGetLastError(); + if(error != cudaSuccess) { + // print the CUDA error message and exit + printf("CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); + } +#endif + ct=ct+ThreadsPerBlock; } + } - } } -/* kernel to calculate residuals */ -__global__ void -kernel_residuals(int B, int N, int T, int K, int F, - const double *__restrict__ u, const double *__restrict__ v, const double *__restrict__ w, - const double *__restrict__ p, int nchunk, - baseline_t *barr, const double *__restrict__ freqs, const float *__restrict__ beam, const double *__restrict__ ll, const double *__restrict__ mm, const double *__restrict__ nn, - const double *__restrict__ sI, const double *__restrict__ sQ, const double *__restrict__ sU, const double *__restrict__ sV, - const unsigned char *__restrict__ stype, const double *__restrict__ sI0, -const double *__restrict__ sQ0, const double *__restrict__ sU0, const double *__restrict__ sV0, - const double *__restrict__ f0, const double *__restrict__ spec_idx, const double *__restrict__ spec_idx1, const double *__restrict__ spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh, int dobeam) { - - /* global thread index */ - unsigned int n=threadIdx.x+blockDim.x*blockIdx.x; - - /* each thread will calculate for one baseline, over all sources */ - if (n1 - if (F==1) { - //use simply for-loop, if K is very large this may be slow and may need further parallelization - for (int k=0; kdata, y dim-> stations */ - dim3 grid(1, 1, 1); - grid.x = (int)ceilf((K*T*F) / (float)ThreadsPerBlock); - grid.y = N; - - kernel_array_beam<<>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam); + int BlocksPerGrid= 2*(Ntotal+ThreadsPerBlock-1)/ThreadsPerBlock; + kernel_array_beam<<>>(N,T,K,F,freqs,longitude,latitude,time_utc,Nelem,xx,yy,zz,ra,dec,ph_ra0,ph_dec0,ph_freq0,beam,buffer); cudaDeviceSynchronize(); + cudaFree(buffer); #ifdef CUDA_DBG error = cudaGetLastError(); if(error != cudaSuccess) { @@ -1015,7 +751,7 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud /* calculate coherencies: - B: total baselines (could be more than one timeslot) + B: total baselines N: no of stations T: no of time slots K: no of sources @@ -1038,20 +774,26 @@ cudakernel_array_beam(int N, int T, int K, int F, double *freqs, float *longitud dobeam: enable beam if >0 */ void -cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, double *w,baseline_t 
*barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV, - unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) { +cudakernel_coherencies(int B, int N, int T, int K, int F, float *u, float *v, float *w,baseline_t *barr, float *freqs, float *beam, float *ll, float *mm, float *nn, float *sI, + unsigned char *stype, float *sI0, float *f0, float *spec_idx, float *spec_idx1, float *spec_idx2, int **exs, float deltaf, float deltat, float dec0, float *coh,int dobeam) { #ifdef CUDA_DBG cudaError_t error; error = cudaGetLastError(); + error=cudaMemset(coh,0,sizeof(float)*8*B*F); + checkCudaError(error,__FILE__,__LINE__); +#endif +#ifndef CUDA_DBG + cudaMemset(coh,0,sizeof(float)*8*B*F); #endif /* spawn threads to handle baselines, these threads will spawn threads for sources */ int ThreadsPerBlock=DEFAULT_TH_PER_BK; + // int ThreadsPerBlock=16; /* note: make sure we do not exceed max no of blocks available, otherwise (too many baselines, loop over source id) */ - int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock; - kernel_coherencies<<>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV, - stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); + int BlocksPerGrid= 2*(B+ThreadsPerBlock-1)/ThreadsPerBlock; + kernel_coherencies<<>>(B, N, T, K, F,u,v,w,barr,freqs, beam, ll, mm, nn, sI, + stype, sI0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); cudaDeviceSynchronize(); #ifdef CUDA_DBG error = cudaGetLastError(); @@ -1064,56 +806,6 @@ cudakernel_coherencies(int B, int N, int T, int K, int F, double *u, double *v, } -/* p : parameters 8Nxnchunk values */ -void -cudakernel_residuals(int B, int N, int T, int K, int F, double *u, double *v, double *w, double *p, int nchunk, baseline_t *barr, double *freqs, float *beam, double *ll, double *mm, double *nn, double *sI, double *sQ, double *sU, double *sV, - unsigned char *stype, double *sI0, double *sQ0, double *sU0, double *sV0, double *f0, double *spec_idx, double *spec_idx1, double *spec_idx2, int **exs, double deltaf, double deltat, double dec0, double *coh,int dobeam) { -#ifdef CUDA_DBG - cudaError_t error; - error = cudaGetLastError(); -#endif - - /* spawn threads to handle baselines, these threads will loop over sources */ - int ThreadsPerBlock=DEFAULT_TH_PER_BK; - /* note: make sure we do not exceed max no of blocks available, - otherwise (too many baselines, loop over source id) */ - int BlocksPerGrid=(B+ThreadsPerBlock-1)/ThreadsPerBlock; - kernel_residuals<<>>(B, N, T, K, F,u,v,w,p,nchunk,barr,freqs, beam, ll, mm, nn, sI, sQ, sU, sV, - stype, sI0, sQ0, sU0, sV0, f0, spec_idx, spec_idx1, spec_idx2, exs, deltaf, deltat, dec0, coh, dobeam); - cudaDeviceSynchronize(); -#ifdef CUDA_DBG - error = cudaGetLastError(); - if(error != cudaSuccess) { - // print the CUDA error message and exit - fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); - exit(-1); - } -#endif -} - - -void -cudakernel_correct_residuals(int B, int N, int Nb, int boff, int F, int nchunk, double *x, double *p, baseline_t *barr) { -#ifdef CUDA_DBG - cudaError_t error; - error = cudaGetLastError(); -#endif - - /* spawn threads to handle baselines */ - int ThreadsPerBlock=DEFAULT_TH_PER_BK; - int 
BlocksPerGrid=(Nb+ThreadsPerBlock-1)/ThreadsPerBlock; - kernel_correct_residuals<<>>(B, N, Nb, boff, F, nchunk, x, p, barr); - cudaDeviceSynchronize(); -#ifdef CUDA_DBG - error = cudaGetLastError(); - if(error != cudaSuccess) { - // print the CUDA error message and exit - fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__); - exit(-1); - } -#endif -} - /* convert time JD to GMST angle store result at the same location */ void @@ -1126,10 +818,10 @@ cudakernel_convert_time(int T, double *time_utc) { int ThreadsPerBlock=DEFAULT_TH_PER_BK; /* note: make sure we do not exceed max no of blocks available, otherwise (too many baselines, loop over source id) */ - int BlocksPerGrid=(T+ThreadsPerBlock-1)/ThreadsPerBlock; + int BlocksPerGrid= 2*(T+ThreadsPerBlock-1)/ThreadsPerBlock; kernel_convert_time<<>>(T,time_utc); cudaDeviceSynchronize(); -#ifdef CUDA_DBG + #ifdef CUDA_DBG error = cudaGetLastError(); if(error != cudaSuccess) { // print the CUDA error message and exit diff --git a/src/lib/Radio/predict_withbeam.c b/src/lib/Radio/predict_withbeam.c index 53a3d9a..52722e2 100644 --- a/src/lib/Radio/predict_withbeam.c +++ b/src/lib/Radio/predict_withbeam.c @@ -1114,7 +1114,7 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit for (cj=0; cj1) */ - int Nb; /* no of baselines handeled by this thread */ - baseline_t *barr; /* baseline info */ - int Nf; /* of of freqs */ - - double *pinv; /* inverse of solutions to apply: size 8N*nchunk */ - int nchunk; /* how many solutions */ -} thread_data_corr_t; /* copy Nx1 double array x to device as float first allocate device memory */ @@ -146,7 +91,7 @@ dtofcopy(int N, float **x_d, double *x) { checkCudaError(err,__FILE__,__LINE__); /* copy memory */ - err=cudaMemcpy(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice); + err=cudaMemcpyAsync(xc,xhost,N*sizeof(float),cudaMemcpyHostToDevice,0); checkCudaError(err,__FILE__,__LINE__); /* free host buffer */ @@ -173,57 +118,38 @@ precalcoh_threadfn(void *data) { err=cudaSetDevice(card); checkCudaError(err,__FILE__,__LINE__); - err=cudaSetDeviceFlags(cudaDeviceLmemResizeToMax); - checkCudaError(err,__FILE__,__LINE__); - /* make sure enough heap memory is available for shapelet computations */ - size_t plim; - err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize); - checkCudaError(err,__FILE__,__LINE__); - if (plimtime_utc as input to fwrite. 
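One caveat on the cudaMemcpy to cudaMemcpyAsync change inside dtofcopy() above: whether the xhost staging buffer is pinned is not visible in this hunk. If it is pageable, the async copy behaves synchronously with respect to that buffer and nothing changes; if it is pinned, the call returns while the DMA transfer may still be in flight, and the immediate free of the host buffer races with the copy unless the stream is synchronized first. A minimal sketch of the staging pattern with an explicit synchronize (illustrative names, not the codebase's function):

#include <cuda_runtime.h>

/* Copy an Nx1 double array to the device as float: stage a float
 * conversion in pinned host memory, start an async H2D copy on the
 * default stream, and synchronize before the staging buffer is freed. */
static int dtof_copy_sketch(int N, float **x_d, const double *x) {
  float *xhost, *xc;
  if (cudaMallocHost((void **)&xhost, N * sizeof(float)) != cudaSuccess)
    return -1;
  for (int ci = 0; ci < N; ci++) xhost[ci] = (float)x[ci];
  if (cudaMalloc((void **)&xc, N * sizeof(float)) != cudaSuccess) {
    cudaFreeHost(xhost);
    return -1;
  }
  cudaError_t err = cudaMemcpyAsync(xc, xhost, N * sizeof(float),
                                    cudaMemcpyHostToDevice, 0);
  /* without this, cudaFreeHost below could race with the in-flight copy */
  cudaStreamSynchronize(0);
  cudaFreeHost(xhost);
  if (err != cudaSuccess) { cudaFree(xc); return -1; }
  *x_d = xc;
  return 0;
}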
+ if ((copyoftimed=(double*)calloc((size_t) t->tilesz, sizeof(double)))==0) { + fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__); + exit(1); } - double *ud,*vd,*wd; - double *cohd; - baseline_t *barrd; - double *freqsd; - float *longd=0,*latd=0; double *timed; int *Nelemd; - float **xx_p=0,**yy_p=0,**zz_p=0; + float **xx_p,**yy_p,**zz_p; float **xxd,**yyd,**zzd; /* allocate memory in GPU */ - err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(double)); /* coherencies only for 1 cluster, Nf=1 */ + err=cudaMalloc((void**) &cohd, t->Nbase*8*sizeof(float)); /* coherencies only for 1 cluster, Nf=1 */ checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t)); checkCudaError(err,__FILE__,__LINE__); err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int)); checkCudaError(err,__FILE__,__LINE__); - /* u,v,w and l,m,n coords need to be double for precision */ - err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double)); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double)); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double)); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice); - checkCudaError(err,__FILE__,__LINE__); - + /* copy to device */ + dtofcopy(t->Nbase,&ud,t->u); + dtofcopy(t->Nbase,&vd,t->v); + dtofcopy(t->Nbase,&wd,t->w); err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice); checkCudaError(err,__FILE__,__LINE__); - err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double)); - checkCudaError(err,__FILE__,__LINE__); - err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice); - checkCudaError(err,__FILE__,__LINE__); - - if (t->dobeam) { + dtofcopy(t->Nf,&freqsd,t->freqs); dtofcopy(t->N,&longd,t->longitude); dtofcopy(t->N,&latd,t->latitude); err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double)); @@ -233,9 +159,19 @@ precalcoh_threadfn(void *data) { /* convert time jd to GMST angle */ cudakernel_convert_time(t->tilesz,timed); + // Fill copyoftimed. 
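/* The in-place conversion above turns the UTC (JD) timestamps in timed
   into GMST angles on the device; the copy below brings the converted
   values back so the host keeps both versions (t->time_utc stays in UTC
   for the later fwrite). Note that err from this device-to-host copy is
   overwritten by the next call before any checkCudaError() test, so a
   failure here would go unnoticed. */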
+ // Fill copyoftimed.
+ err=cudaMemcpy((double*)copyoftimed, timed, t->tilesz*sizeof(double), cudaMemcpyDeviceToHost);
+ checkCudaError(err,__FILE__,__LINE__);
+
 err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 
+ /* temp host storage to copy coherencies */
+ complex float *tempcoh;
+ if ((tempcoh=(complex float*)calloc((size_t)t->Nbase*4,sizeof(complex float)))==0) {
+  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+
 /* jagged arrays for element locations */
 err=cudaMalloc((void**)&xxd, t->N*sizeof(int*));
 checkCudaError(err,__FILE__,__LINE__);
@@ -277,121 +213,40 @@ precalcoh_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
- }
 
 float *beamd;
- double *lld,*mmd,*nnd;
- double *sId; float *rad,*decd;
- double *sQd,*sUd,*sVd;
+ /* temp host storage for beam */
+ float *tempbeam;
+
+ float *lld,*mmd,*nnd,*sId,*rad,*decd;
 unsigned char *styped;
- double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
- double *sQ0d,*sU0d,*sV0d;
+ float *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
 int **host_p,**dev_p;
-
- complex double *tempdcoh;
- err=cudaMallocHost((void**)&tempdcoh,sizeof(complex double)*(size_t)t->Nbase*4);
- checkCudaError(err,__FILE__,__LINE__);
-
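/* A minimal sketch (not SAGECal code) of the jagged-array pattern built above
   for xxd/yyd/zzd: one device buffer per antenna row, plus a device-side table
   of those row pointers, filled on the host and shipped in one cudaMemcpy.
   A kernel can then index the result as a float**; rowlen is assumed given. */
#include <cuda_runtime.h>
#include <stdlib.h>

static float **
make_jagged(int nrows, const int *rowlen, float ***rows_host_out) {
  float **rows_host=(float**)calloc((size_t)nrows,sizeof(float*));
  for (int ci=0; ci<nrows; ci++) {
    cudaMalloc((void**)&rows_host[ci], rowlen[ci]*sizeof(float)); /* row buffer */
  }
  float **rows_dev;
  cudaMalloc((void**)&rows_dev, nrows*sizeof(float*));
  /* copy the pointer table itself to the device */
  cudaMemcpy(rows_dev, rows_host, nrows*sizeof(float*), cudaMemcpyHostToDevice);
  *rows_host_out=rows_host; /* keep it: needed later to cudaFree each row */
  return rows_dev;
}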
/******************* begin loop over clusters **************************/
 for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
-
- if (t->dobeam) {
-  /* allocate memory for this clusters beam */
-  err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
-  checkCudaError(err,__FILE__,__LINE__);
- } else {
-  beamd=0;
- }
+ /* allocate memory for this clusters beam */
+ err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
+ checkCudaError(err,__FILE__,__LINE__);
 
 /* copy cluster details to GPU */
 err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
 checkCudaError(err,__FILE__,__LINE__);
 
- err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- if (t->Nf==1) {
-  err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- if (t->dobeam) {
-  dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
-  dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
- } else {
-  rad=0;
-  decd=0;
- }
+ dtofcopy(t->carr[ncl].N,&lld,t->carr[ncl].ll);
+ dtofcopy(t->carr[ncl].N,&mmd,t->carr[ncl].mm);
+ dtofcopy(t->carr[ncl].N,&nnd,t->carr[ncl].nn);
+ dtofcopy(t->carr[ncl].N,&sId,t->carr[ncl].sI);
+ dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
+ dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
 err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 
- /* for multi channel data - FIXME: remove this part because Nf==1 always */
- if (t->Nf>1) {
-  err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
+ /* for multi channel data */
+ dtofcopy(t->carr[ncl].N,&sI0d,t->carr[ncl].sI0);
+ dtofcopy(t->carr[ncl].N,&f0d,t->carr[ncl].f0);
+ dtofcopy(t->carr[ncl].N,&spec_idxd,t->carr[ncl].spec_idx);
+ dtofcopy(t->carr[ncl].N,&spec_idx1d,t->carr[ncl].spec_idx1);
+ dtofcopy(t->carr[ncl].N,&spec_idx2d,t->carr[ncl].spec_idx2);
 
 /* extra info for source, if any */
 if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
@@ -442,29 +297,152 @@ precalcoh_threadfn(void *data) {
 }
 
+ if ((tempbeam=(float*)calloc((size_t) t->N*t->tilesz*t->carr[ncl].N*t->Nf,sizeof(float)))==0) {
+  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
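/* The deleted comment above ("u,v,w and l,m,n coords need to be double for
   precision") is the main risk of this float conversion: the model phase is
   2*pi*(u*l+v*m+w*n) with u,v,w in wavelengths, and single precision can
   leave phase errors of order 1e-2 rad on long baselines. A standalone sketch
   with illustrative numbers (not project data): */
#include <stdio.h>
#include <math.h>

int main(void) {
  double u=8.0e5, v=-3.2e5, w=1.1e4;  /* long baseline, in wavelengths */
  double l=0.05, m=-0.02, n=sqrt(1.0-l*l-m*m);
  double ph=2.0*M_PI*(u*l+v*m+w*n);   /* double-precision phase */
  float phf=2.0f*(float)M_PI*((float)u*(float)l+(float)v*(float)m+(float)w*(float)n);
  printf("phase error = %e rad\n", fabs(ph-(double)phf));
  return 0;
}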
 /* now copy pointer locations to device */
 err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 
- if (t->dobeam) {
-  /* now calculate beam for all sources in this cluster */
-  cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
- }
+ FILE *t_N;
+ t_N=fopen("t_N.bin","wb");
+ fwrite(&t->N, sizeof(int), 1, t_N);
+ fclose(t_N);
+
+ FILE *t_tilesz;
+ t_tilesz=fopen("t_tilesz.bin","wb");
+ fwrite(&t->tilesz, sizeof(int), 1, t_tilesz);
+ fclose(t_tilesz);
+
+ FILE *t_carr_ncl_N;
+ t_carr_ncl_N=fopen("t_carr_ncl_N.bin","wb");
+ fwrite(&t->carr[ncl].N, sizeof(int), 1, t_carr_ncl_N);
+ fclose(t_carr_ncl_N);
+
+ FILE *t_Nf;
+ t_Nf=fopen("t_Nf.bin","wb");
+ fwrite(&t->Nf, sizeof(int), 1, t_Nf);
+ fclose(t_Nf);
+
+ FILE *freq_sd;
+ freq_sd=fopen("freq_sd.bin","wb");
+ fwrite(t->freqs, sizeof(double), t->Nf, freq_sd);
+ fclose(freq_sd);
+
+ FILE *long_d;
+ long_d=fopen("long_d.bin","wb");
+ fwrite(t->longitude, sizeof(double), t->N, long_d);
+ fclose(long_d);
+
+ FILE *lat_d;
+ lat_d=fopen("lat_d.bin","wb");
+ fwrite(t->latitude, sizeof(double), t->N, lat_d);
+ fclose(lat_d);
+
+ FILE *time_d;
+ time_d=fopen("time_d.bin","wb");
+ fwrite(copyoftimed, sizeof(double), t->tilesz, time_d);
+ fclose(time_d);
+
+ FILE *Nelem_d;
+ Nelem_d=fopen("Nelem_d.bin","wb");
+ fwrite(t->Nelem, sizeof(int), t->N, Nelem_d);
+ fclose(Nelem_d);
+
+ FILE *xx_d;
+ xx_d=fopen("xx_d.bin","wb");
+ int j;
+ for (j=0; j<t->N; j++){
+  fwrite(t->xx[j], sizeof(double), t->Nelem[j], xx_d);
+ }
+ fclose(xx_d);
+
+ FILE *yy_d;
+ yy_d=fopen("yy_d.bin","wb");
+ for (j=0; j<t->N; j++){
+  fwrite(t->yy[j], sizeof(double), t->Nelem[j], yy_d);
+ }
+ fclose(yy_d);
+
+ FILE *zz_d;
+ zz_d=fopen("zz_d.bin","wb");
+ for (j=0; j<t->N; j++){
+  fwrite(t->zz[j], sizeof(double), t->Nelem[j], zz_d);
+ }
+ fclose(zz_d);
+
+ FILE *ra_d;
+ ra_d=fopen("ra_d.bin","wb");
+ fwrite(t->carr[ncl].ra, sizeof(double), t->carr[ncl].N, ra_d);
+ fclose(ra_d);
+
+ FILE *dec_d;
+ dec_d=fopen("dec_d.bin","wb");
+ fwrite(t->carr[ncl].dec, sizeof(double), t->carr[ncl].N, dec_d);
+ fclose(dec_d);
+
+ FILE *t_ph_ra0;
+ t_ph_ra0=fopen("t_ph_ra0.bin","wb");
+ fwrite(&t->ph_ra0, sizeof(double), 1, t_ph_ra0);
+ fclose(t_ph_ra0);
+
+ FILE *t_ph_dec0;
+ t_ph_dec0=fopen("t_ph_dec0.bin","wb");
+ fwrite(&t->ph_dec0, sizeof(double), 1, t_ph_dec0);
+ fclose(t_ph_dec0);
+
+ FILE *t_ph_freq0;
+ t_ph_freq0=fopen("t_ph_freq0.bin","wb");
+ fwrite(&t->ph_freq0, sizeof(double), 1, t_ph_freq0);
+ fclose(t_ph_freq0);
+
+ /* now calculate beam for all sources in this cluster */
+ cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
+
+ FILE *beam_d;
+ beam_d=fopen("beam_d.bin","wb");
+ err=cudaMemcpy((float*)tempbeam, beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float), cudaMemcpyDeviceToHost);
+ checkCudaError(err,__FILE__,__LINE__);
+
+ for (j=0; j<t->N*t->tilesz*t->carr[ncl].N*t->Nf; j++){
+  fwrite(&tempbeam[j], sizeof(float), 1, beam_d);
+ }
+ fclose(beam_d);
+ free(tempbeam);
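/* Each dump above repeats the same fopen/fwrite/fclose sequence. A
   hypothetical helper (dump_darray is not part of SAGECal) that captures the
   pattern, keeping fwrite's (element size, element count) order consistent: */
#include <stdio.h>

static void
dump_darray(const char *fname, const double *arr, size_t n) {
  FILE *fp=fopen(fname,"wb");
  if (!fp) { perror(fname); return; }
  fwrite(arr, sizeof(double), n, fp); /* size first, then count */
  fclose(fp);
}
/* usage: dump_darray("long_d.bin", t->longitude, (size_t)t->N); */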
 /* calculate coherencies for all sources in this cluster, add them up */
 cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
- lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
+ lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,(float)t->fdelta,(float)t->tdelta,(float)t->dec0,cohd,t->dobeam);
 
 /* copy back coherencies to host,
  coherencies on host have 8M stride, on device have 8 stride */
- err=cudaMemcpy((double*)tempdcoh, cohd, sizeof(double)*t->Nbase*8, cudaMemcpyDeviceToHost);
+ err=cudaMemcpy((float*)tempcoh, cohd, sizeof(float)*t->Nbase*8, cudaMemcpyDeviceToHost);
 checkCudaError(err,__FILE__,__LINE__);
 
+ complex double *tempdcoh;
+ if ((tempdcoh=(complex double*)calloc((size_t)t->Nbase*4,sizeof(complex double)))==0) {
+  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+ /* widen the float coherencies back to double */
+ int di;
+ double *dcp=(double*)tempdcoh;
+ float *fcp=(float*)tempcoh;
+ for (di=0; di<t->Nbase; di++) {
+  dcp[8*di]=(double)fcp[8*di];
+  dcp[8*di+1]=(double)fcp[8*di+1];
+  dcp[8*di+2]=(double)fcp[8*di+2];
+  dcp[8*di+3]=(double)fcp[8*di+3];
+  dcp[8*di+4]=(double)fcp[8*di+4];
+  dcp[8*di+5]=(double)fcp[8*di+5];
+  dcp[8*di+6]=(double)fcp[8*di+6];
+  dcp[8*di+7]=(double)fcp[8*di+7];
+ }
 /* now copy this with right offset and stride */
 my_ccopy(t->Nbase,&tempdcoh[0],4,&(t->coh[4*ncl]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[1],4,&(t->coh[4*ncl+1]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[2],4,&(t->coh[4*ncl+2]),4*t->M);
 my_ccopy(t->Nbase,&tempdcoh[3],4,&(t->coh[4*ncl+3]),4*t->M);
+ free(tempdcoh);
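/* my_ccopy above behaves like BLAS zcopy (n, x, incx, y, incy): with source
   stride 4 and destination stride 4*t->M it interleaves this cluster's four
   correlations per baseline into the M-cluster coherency array t->coh. A
   sketch of those semantics (an assumption: my_ccopy as the usual strided copy): */
#include <complex.h>

static void
ccopy_sketch(int n, const double complex *x, int incx,
             double complex *y, int incy) {
  for (int i=0; i<n; i++) {
    y[i*incy]=x[i*incx]; /* one complex correlation per baseline */
  }
}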
 for (cj=0; cj<t->carr[ncl].N; cj++) {
@@ -493,36 +471,24 @@ precalcoh_threadfn(void *data) {
  err=cudaFree(dev_p);
  checkCudaError(err,__FILE__,__LINE__);
 
- if (t->dobeam) {
-  err=cudaFree(beamd);
-  checkCudaError(err,__FILE__,__LINE__);
- }
+ err=cudaFree(beamd);
+ checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(lld);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(mmd);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(nnd);
 checkCudaError(err,__FILE__,__LINE__);
- if (t->Nf==1) {
 err=cudaFree(sId);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sQd);
+ err=cudaFree(rad);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sUd);
+ err=cudaFree(decd);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sVd);
- checkCudaError(err,__FILE__,__LINE__);
- }
- if (t->dobeam) {
-  err=cudaFree(rad);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(decd);
-  checkCudaError(err,__FILE__,__LINE__);
- }
 err=cudaFree(styped);
 checkCudaError(err,__FILE__,__LINE__);
- if (t->Nf>1) {
 err=cudaFree(sI0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(f0d);
@@ -533,21 +499,14 @@ precalcoh_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(spec_idx2d);
  checkCudaError(err,__FILE__,__LINE__);
-
- err=cudaFree(sQ0d);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sU0d);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sV0d);
- checkCudaError(err,__FILE__,__LINE__);
- }
 }
/******************* end loop over clusters **************************/
 
- /* free memory */
- err=cudaFreeHost(tempdcoh);
- checkCudaError(err,__FILE__,__LINE__);
+ free(tempcoh);
+ free(copyoftimed);
 
+ /* free memory */
 err=cudaFree(ud);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(vd);
@@ -560,8 +519,6 @@ precalcoh_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(freqsd);
  checkCudaError(err,__FILE__,__LINE__);
-
- if (t->dobeam) {
 err=cudaFree(longd);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(latd);
@@ -571,6 +528,7 @@ precalcoh_threadfn(void *data) {
 err=cudaFree(Nelemd);
 checkCudaError(err,__FILE__,__LINE__);
+
 for (ci=0; ci<t->N; ci++) {
  err=cudaFree(xx_p[ci]);
  checkCudaError(err,__FILE__,__LINE__);
@@ -590,11 +548,9 @@ precalcoh_threadfn(void *data) {
 free(xx_p);
 free(yy_p);
 free(zz_p);
- }
 
 /* reset error state */
 err=cudaGetLastError();
-
 return NULL;
}
@@ -630,7 +586,12 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
  taskhist thst;
  init_task_hist(&thst);
 
- int Ngpu=MAX_GPU_ID+1;
+ int Ngpu;
+ if (M<4) {
+  Ngpu=2;
+ } else {
+  Ngpu=4;
+ }
 
 /* calculate min clusters thread can handle */
 Nthb0=(M+Ngpu-1)/Ngpu;
@@ -664,10 +625,10 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
 threaddata[nth].u=u;
 threaddata[nth].v=v;
 threaddata[nth].w=w;
 threaddata[nth].coh=x;
- threaddata[nth].x=0; /* no input data */
+ threaddata[nth].x=0; /* no data */
 threaddata[nth].N=N;
- threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbase x tilesz (from input) */
+ threaddata[nth].Nbase=Nbase; /* total baselines: actually Nbase x tilesz */
 threaddata[nth].barr=barr;
 threaddata[nth].carr=carr;
 threaddata[nth].M=M;
@@ -768,7 +729,6 @@ precalculate_coherencies_withbeam_gpu(double *u, double *v, double *w, complex d
 free(threaddata1);
 pthread_attr_destroy(&attr);
 free(th_array);
-
 return 0;
}
@@ -789,33 +749,18 @@ predictvis_threadfn(void *data) {
 cudaError_t err;
 int ci,ncl,cj;
-
 err=cudaSetDevice(card);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaSetDeviceFlags(cudaDeviceLmemResizeToMax);
- checkCudaError(err,__FILE__,__LINE__);
 
- /* make sure enough heap memory is available for shapelet computations */
- size_t plim;
- err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
- checkCudaError(err,__FILE__,__LINE__);
- if (plim<GPU_HEAP_SIZE*1024*1024) {
-  err=cudaDeviceSetLimit(cudaLimitMallocHeapSize,GPU_HEAP_SIZE*1024*1024);
-  checkCudaError(err,__FILE__,__LINE__);
- }
 
- double *ud,*vd,*wd;
- double *cohd;
+ float *ud,*vd,*wd;
+ float *cohd;
 baseline_t *barrd;
- double *freqsd;
- float *longd=0,*latd=0;
+ float *freqsd;
+ float *longd,*latd;
 double *timed;
 int *Nelemd;
 float **xx_p,**yy_p,**zz_p;
 float **xxd,**yyd,**zzd;
 
 /* allocate memory in GPU */
- err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
+ err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(float)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
 checkCudaError(err,__FILE__,__LINE__);
@@ -823,31 +768,13 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
 
 /* copy to device */
- /* u,v,w and l,m,n coords need to be double for precision */
- err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
+ dtofcopy(t->Nbase,&ud,t->u);
+ dtofcopy(t->Nbase,&vd,t->v);
+ dtofcopy(t->Nbase,&wd,t->w);
 err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- /* check if beam is actually calculated */
- if (t->dobeam) {
+ dtofcopy(t->Nf,&freqsd,t->freqs);
 dtofcopy(t->N,&longd,t->longitude);
 dtofcopy(t->N,&latd,t->latitude);
 err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
@@ -901,122 +828,39 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
- }
 
 float *beamd;
- double *lld,*mmd,*nnd;
- double *sId; float *rad,*decd;
- double *sQd,*sUd,*sVd;
+ float *lld,*mmd,*nnd,*sId,*rad,*decd;
 unsigned char *styped;
- double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
- double *sQ0d,*sU0d,*sV0d;
+ float *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
 int **host_p,**dev_p;
-
- double *xlocal;
- err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
- checkCudaError(err,__FILE__,__LINE__);
-
/******************* begin loop over clusters **************************/
 for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
  /* allocate memory for this clusters beam */
- if (t->dobeam) {
-  err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
-  checkCudaError(err,__FILE__,__LINE__);
- } else {
-  beamd=0;
- }
+ err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
+ checkCudaError(err,__FILE__,__LINE__);
 
 /* copy cluster details to GPU */
 err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
 checkCudaError(err,__FILE__,__LINE__);
 
- err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- if (t->Nf==1) {
-  err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- if (t->dobeam) {
-  dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
-  dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
- } else {
-  rad=0;
-  decd=0;
- }
+ dtofcopy(t->carr[ncl].N,&lld,t->carr[ncl].ll);
+ dtofcopy(t->carr[ncl].N,&mmd,t->carr[ncl].mm);
+ dtofcopy(t->carr[ncl].N,&nnd,t->carr[ncl].nn);
+ dtofcopy(t->carr[ncl].N,&sId,t->carr[ncl].sI);
+ dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
+ dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
 err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 
 /* for multi channel data */
- if (t->Nf>1) {
-  err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
+ dtofcopy(t->carr[ncl].N,&sI0d,t->carr[ncl].sI0);
+ dtofcopy(t->carr[ncl].N,&f0d,t->carr[ncl].f0);
+ dtofcopy(t->carr[ncl].N,&spec_idxd,t->carr[ncl].spec_idx);
+ dtofcopy(t->carr[ncl].N,&spec_idx1d,t->carr[ncl].spec_idx1);
+ dtofcopy(t->carr[ncl].N,&spec_idx2d,t->carr[ncl].spec_idx2);
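/* The sI0/f0/spec_idx/spec_idx1/spec_idx2 arrays copied above parameterize the
   multi-channel flux model: a log-polynomial spectrum around the reference
   frequency f0, matching the three spectral-index terms in the sky-model
   format (an assumption about the exact convention; valid for positive sI0).
   A host-side sketch of the per-channel evaluation: */
#include <math.h>

static double
flux_at_freq(double sI0, double f0, double sp0, double sp1, double sp2, double f) {
  double lf=log(f/f0); /* log frequency ratio */
  return exp(log(sI0)+sp0*lf+sp1*lf*lf+sp2*lf*lf*lf);
}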
 /* extra info for source, if any */
 if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
@@ -1072,21 +916,37 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
 
- if (t->dobeam) {
-  /* now calculate beam for all sources in this cluster */
-  cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
- }
+ /* now calculate beam for all sources in this cluster */
+ cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
 
 /* calculate coherencies for all sources in this cluster, add them up */
 cudakernel_coherencies(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,barrd,freqsd,beamd,
- lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
+ lld,mmd,nnd,sId,styped,sI0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,(float)t->fdelta,(float)t->tdelta,(float)t->dec0,cohd,t->dobeam);
 
 /* copy back coherencies to host,
  coherencies on host have 8M stride, on device have 8 stride */
- err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
+ float *tempx;
+ if ((tempx=(float*)calloc((size_t)t->Nbase*8*t->Nf,sizeof(float)))==0) {
+  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
+  exit(1);
+ }
+ err=cudaMemcpy(tempx, cohd, sizeof(float)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
 checkCudaError(err,__FILE__,__LINE__);
- my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
+ /* copy back as double */
+ int di;
+ for (di=0; di<t->Nbase*t->Nf; di++) {
+  t->x[8*di]=(double)tempx[8*di];
+  t->x[8*di+1]=(double)tempx[8*di+1];
+  t->x[8*di+2]=(double)tempx[8*di+2];
+  t->x[8*di+3]=(double)tempx[8*di+3];
+  t->x[8*di+4]=(double)tempx[8*di+4];
+  t->x[8*di+5]=(double)tempx[8*di+5];
+  t->x[8*di+6]=(double)tempx[8*di+6];
+  t->x[8*di+7]=(double)tempx[8*di+7];
+ }
+ free(tempx);
+
 for (cj=0; cj<t->carr[ncl].N; cj++) {
  if (t->carr[ncl].stype[cj]==STYPE_POINT) {
@@ -1114,36 +974,24 @@ predictvis_threadfn(void *data) {
  err=cudaFree(dev_p);
  checkCudaError(err,__FILE__,__LINE__);
 
- if (t->dobeam) {
-  err=cudaFree(beamd);
-  checkCudaError(err,__FILE__,__LINE__);
- }
+ err=cudaFree(beamd);
+ checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(lld);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(mmd);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(nnd);
 checkCudaError(err,__FILE__,__LINE__);
- if (t->Nf==1) {
 err=cudaFree(sId);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sQd);
+ err=cudaFree(rad);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sUd);
+ err=cudaFree(decd);
 checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sVd);
- checkCudaError(err,__FILE__,__LINE__);
- }
- if (t->dobeam) {
-  err=cudaFree(rad);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(decd);
-  checkCudaError(err,__FILE__,__LINE__);
- }
 err=cudaFree(styped);
 checkCudaError(err,__FILE__,__LINE__);
- if (t->Nf>1) {
 err=cudaFree(sI0d);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(f0d);
@@ -1154,21 +1002,10 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(spec_idx2d);
  checkCudaError(err,__FILE__,__LINE__);
-
- err=cudaFree(sQ0d);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sU0d);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(sV0d);
- checkCudaError(err,__FILE__,__LINE__);
- }
 }
/******************* end loop over clusters **************************/
 
 /* free memory */
- err=cudaFreeHost(xlocal);
- checkCudaError(err,__FILE__,__LINE__);
-
 err=cudaFree(ud);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(vd);
@@ -1181,8 +1018,6 @@ predictvis_threadfn(void *data) {
  checkCudaError(err,__FILE__,__LINE__);
  err=cudaFree(freqsd);
  checkCudaError(err,__FILE__,__LINE__);
-
- if (t->dobeam) {
 err=cudaFree(longd);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(latd);
@@ -1212,9 +1047,6 @@ predictvis_threadfn(void *data) {
 free(xx_p);
 free(yy_p);
 free(zz_p);
- }
-
 
 /* reset error state */
 err=cudaGetLastError();
@@ -1235,8 +1067,12 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
  taskhist thst;
  init_task_hist(&thst);
 
- /* oversubscribe GPU */
- int Ngpu=MAX_GPU_ID+1;
+ int Ngpu;
+ if (M<4) {
+  Ngpu=2;
+ } else {
+  Ngpu=4;
+ }
 
 /* calculate min clusters thread can handle */
 Nthb0=(M+Ngpu-1)/Ngpu;
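/* The cluster-to-thread split used on both code paths: Nthb0=(M+Ngpu-1)/Ngpu
   is a ceiling division, and the last thread picks up the remainder. A
   standalone sketch of the bookkeeping (the ci/Nthb names follow this file): */
#include <stdio.h>

int main(void) {
  int M=10;                    /* total clusters */
  int Ngpu=(M<4?2:4);          /* same heuristic as the patch */
  int Nthb0=(M+Ngpu-1)/Ngpu;   /* max clusters per thread */
  int ci=0;
  for (int nth=0; nth<Ngpu && ci<M; nth++) {
    int Nthb=(ci+Nthb0<M ? Nthb0 : M-ci); /* this thread's share */
    printf("thread %d: clusters [%d,%d)\n", nth, ci, ci+Nthb);
    ci+=Nthb;
  }
  return 0;
}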
@@ -1261,14 +1097,14 @@ double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latit
  exit(1);
 }
 
- if (add_to_data==SIMUL_ONLY) {
-  /* set input column to zero */
+ if (!add_to_data) {
+  /* set output column to zero */
  memset(x,0,sizeof(double)*Nbase*8*tilesz*Nchan);
 }
 
- /* set common parameters, and split clusters to threads */
+ /* set common parameters, and split baselines to threads */
 ci=0;
 for (nth=0; nth<Ngpu && ci<M; nth++) {
-static void *
-residual_threadfn(void *data) {
- thread_data_pred_t *t=(thread_data_pred_t*)data;
- /* first, select a GPU to run on */
- int card;
- if (t->M<=MAX_GPU_ID) {
-  card=select_work_gpu(MAX_GPU_ID,t->hst);
- } else {
-  card=t->tid;
- }
- cudaError_t err;
- int ci,ncl,cj;
-
-
- err=cudaSetDevice(card);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaSetDeviceFlags(cudaDeviceLmemResizeToMax);
- checkCudaError(err,__FILE__,__LINE__);
-
- /* make sure enough heap memory is available for shapelet computations */
- size_t plim;
- err=cudaDeviceGetLimit(&plim,cudaLimitMallocHeapSize);
- checkCudaError(err,__FILE__,__LINE__);
- if (plim<GPU_HEAP_SIZE*1024*1024) {
-  err=cudaDeviceSetLimit(cudaLimitMallocHeapSize,GPU_HEAP_SIZE*1024*1024);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- double *ud,*vd,*wd;
- double *cohd;
- baseline_t *barrd;
- double *freqsd;
- float *longd=0,*latd=0;
- double *timed;
- int *Nelemd;
- float **xx_p,**yy_p,**zz_p;
- float **xxd,**yyd,**zzd;
-
- /* allocate memory in GPU */
- err=cudaMalloc((void**) &cohd, t->Nbase*8*t->Nf*sizeof(double)); /* coherencies only for 1 cluster, Nf freq, used to store sum of clusters*/
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &barrd, t->Nbase*sizeof(baseline_t));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &Nelemd, t->N*sizeof(int));
- checkCudaError(err,__FILE__,__LINE__);
-
- /* copy to device */
- /* u,v,w and l,m,n coords need to be double for precision */
- err=cudaMalloc((void**) &ud, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &vd, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &wd, t->Nbase*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(ud, t->u, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(vd, t->v, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(wd, t->w, t->Nbase*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- err=cudaMemcpy(barrd, t->barr, t->Nbase*sizeof(baseline_t), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &freqsd, t->Nf*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(freqsd, t->freqs, t->Nf*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- /* check if beam is actually calculated */
- if (t->dobeam) {
-  dtofcopy(t->N,&longd,t->longitude);
-  dtofcopy(t->N,&latd,t->latitude);
-  err=cudaMalloc((void**) &timed, t->tilesz*sizeof(double));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(timed, t->time_utc, t->tilesz*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  /* convert time jd to GMST angle */
-  cudakernel_convert_time(t->tilesz,timed);
-
-  err=cudaMemcpy(Nelemd, t->Nelem, t->N*sizeof(int), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  /* jagged arrays for element locations */
-  err=cudaMalloc((void**)&xxd, t->N*sizeof(int*));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**)&yyd, t->N*sizeof(int*));
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMalloc((void**)&zzd, t->N*sizeof(int*));
-  checkCudaError(err,__FILE__,__LINE__);
-  /* allocate host memory to store pointers */
-  if ((xx_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
-   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if ((yy_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
-   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if ((zz_p=(float**)calloc((size_t)t->N,sizeof(int*)))==0) {
-   fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  for (ci=0; ci<t->N; ci++) {
-   err=cudaMalloc((void**)&xx_p[ci], t->Nelem[ci]*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMalloc((void**)&yy_p[ci], t->Nelem[ci]*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMalloc((void**)&zz_p[ci], t->Nelem[ci]*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-  }
-  /* now copy data */
-  for (ci=0; ci<t->N; ci++) {
-   dtofcopy(t->Nelem[ci],&xx_p[ci],t->xx[ci]);
-   dtofcopy(t->Nelem[ci],&yy_p[ci],t->yy[ci]);
-   dtofcopy(t->Nelem[ci],&zz_p[ci],t->zz[ci]);
-  }
-  /* now copy pointer locations to device */
-  err=cudaMemcpy(xxd, xx_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(yyd, yy_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaMemcpy(zzd, zz_p, t->N*sizeof(int*), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- float *beamd;
- double *lld,*mmd,*nnd;
- double *sId; float *rad,*decd;
- double *sQd,*sUd,*sVd;
- unsigned char *styped;
- double *sI0d,*f0d,*spec_idxd,*spec_idx1d,*spec_idx2d;
- double *sQ0d,*sU0d,*sV0d;
- int **host_p,**dev_p;
-
- double *xlocal;
- err=cudaMallocHost((void**)&xlocal,sizeof(double)*(size_t)t->Nbase*8*t->Nf);
- checkCudaError(err,__FILE__,__LINE__);
-
- double *pd; /* parameter array per cluster */
-
-/******************* begin loop over clusters **************************/
- for (ncl=t->soff; ncl<t->soff+t->Ns; ncl++) {
-  /* check if cluster id >=0 to do a subtraction */
-  if (t->carr[ncl].id>=0) {
-   /* allocate memory for this clusters beam */
-   if (t->dobeam) {
-    err=cudaMalloc((void**)&beamd, t->N*t->tilesz*t->carr[ncl].N*t->Nf*sizeof(float));
-    checkCudaError(err,__FILE__,__LINE__);
-   } else {
-    beamd=0;
-   }
-
-   /* copy cluster details to GPU */
-   err=cudaMalloc((void**)&styped, t->carr[ncl].N*sizeof(unsigned char));
-   checkCudaError(err,__FILE__,__LINE__);
-
-   err=cudaMalloc((void**) &lld, t->carr[ncl].N*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMemcpy(lld, t->carr[ncl].ll, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMalloc((void**) &mmd, t->carr[ncl].N*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMemcpy(mmd, t->carr[ncl].mm, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMalloc((void**) &nnd, t->carr[ncl].N*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMemcpy(nnd, t->carr[ncl].nn, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   /* parameter vector size may change depending on hybrid parameter */
-   err=cudaMalloc((void**) &pd, t->N*8*t->carr[ncl].nchunk*sizeof(double));
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaMemcpy(pd, &(t->p[t->carr[ncl].p[0]]), t->N*8*t->carr[ncl].nchunk*sizeof(double), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   if (t->Nf==1) {
-    err=cudaMalloc((void**) &sId, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sId, t->carr[ncl].sI, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-
-    err=cudaMalloc((void**) &sQd, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sQd, t->carr[ncl].sQ, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &sUd, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sUd, t->carr[ncl].sU, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &sVd, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sVd, t->carr[ncl].sV, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-
-   if (t->dobeam) {
-    dtofcopy(t->carr[ncl].N,&rad,t->carr[ncl].ra);
-    dtofcopy(t->carr[ncl].N,&decd,t->carr[ncl].dec);
-   } else {
-    rad=0;
-    decd=0;
-   }
-   err=cudaMemcpy(styped, t->carr[ncl].stype, t->carr[ncl].N*sizeof(unsigned char), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   /* for multi channel data */
-   if (t->Nf>1) {
-    err=cudaMalloc((void**) &sI0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sI0d, t->carr[ncl].sI0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &f0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(f0d, t->carr[ncl].f0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &spec_idxd, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(spec_idxd, t->carr[ncl].spec_idx, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &spec_idx1d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(spec_idx1d, t->carr[ncl].spec_idx1, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &spec_idx2d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(spec_idx2d, t->carr[ncl].spec_idx2, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-
-    err=cudaMalloc((void**) &sQ0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sQ0d, t->carr[ncl].sQ0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &sU0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sU0d, t->carr[ncl].sU0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMalloc((void**) &sV0d, t->carr[ncl].N*sizeof(double));
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaMemcpy(sV0d, t->carr[ncl].sV0, t->carr[ncl].N*sizeof(double), cudaMemcpyHostToDevice);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-
-   /* extra info for source, if any */
-   if ((host_p=(int**)calloc((size_t)t->carr[ncl].N,sizeof(int*)))==0) {
-    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
-    exit(1);
-   }
-   err=cudaMalloc((void**)&dev_p, t->carr[ncl].N*sizeof(int*));
-   checkCudaError(err,__FILE__,__LINE__);
-
-   for (cj=0; cj<t->carr[ncl].N; cj++) {
-
-    if (t->carr[ncl].stype[cj]==STYPE_POINT) {
-     host_p[cj]=0;
-    } else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
-     exinfo_shapelet *d=(exinfo_shapelet*)t->carr[ncl].ex[cj];
-     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_shapelet));
-     checkCudaError(err,__FILE__,__LINE__);
-     double *modes;
-     err=cudaMalloc((void**)&modes, d->n0*d->n0*sizeof(double));
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_shapelet), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(modes, d->modes, d->n0*d->n0*sizeof(double), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-     exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
-     err=cudaMemcpy(&(d_p->modes), &modes, sizeof(double*), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
-     exinfo_gaussian *d=(exinfo_gaussian*)t->carr[ncl].ex[cj];
-     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_gaussian));
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_gaussian), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
-     exinfo_disk *d=(exinfo_disk*)t->carr[ncl].ex[cj];
-     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_disk));
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_disk), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_RING) {
-     exinfo_ring *d=(exinfo_ring*)t->carr[ncl].ex[cj];
-     err=cudaMalloc((void**)&host_p[cj], sizeof(exinfo_ring));
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaMemcpy(host_p[cj], d, sizeof(exinfo_ring), cudaMemcpyHostToDevice);
-     checkCudaError(err,__FILE__,__LINE__);
-    }
-
-   }
-   /* now copy pointer locations to device */
-   err=cudaMemcpy(dev_p, host_p, t->carr[ncl].N*sizeof(int*), cudaMemcpyHostToDevice);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   if (t->dobeam) {
-    /* now calculate beam for all sources in this cluster */
-    cudakernel_array_beam(t->N,t->tilesz,t->carr[ncl].N,t->Nf,freqsd,longd,latd,timed,Nelemd,xxd,yyd,zzd,rad,decd,(float)t->ph_ra0,(float)t->ph_dec0,(float)t->ph_freq0,beamd);
-   }
-
-   /* calculate coherencies for all sources in this cluster, add them up */
-   cudakernel_residuals(t->Nbase,t->N,t->tilesz,t->carr[ncl].N,t->Nf,ud,vd,wd,pd,t->carr[ncl].nchunk,barrd,freqsd,beamd,
-   lld,mmd,nnd,sId,sQd,sUd,sVd,styped,sI0d,sQ0d,sU0d,sV0d,f0d,spec_idxd,spec_idx1d,spec_idx2d,dev_p,t->fdelta,t->tdelta,t->dec0,cohd,t->dobeam);
-
-   /* copy back coherencies to host,
-    coherencies on host have 8M stride, on device have 8 stride */
-   err=cudaMemcpy(xlocal, cohd, sizeof(double)*t->Nbase*8*t->Nf, cudaMemcpyDeviceToHost);
-   checkCudaError(err,__FILE__,__LINE__);
-   my_daxpy(t->Nbase*8*t->Nf,xlocal,1.0,t->x);
-
-   for (cj=0; cj<t->carr[ncl].N; cj++) {
-    if (t->carr[ncl].stype[cj]==STYPE_POINT) {
-    } else if (t->carr[ncl].stype[cj]==STYPE_SHAPELET) {
-     exinfo_shapelet *d_p=(exinfo_shapelet *)host_p[cj];
-     double *modes=0;
-     err=cudaMemcpy(&modes, &(d_p->modes), sizeof(double*), cudaMemcpyDeviceToHost);
-     err=cudaFree(modes);
-     checkCudaError(err,__FILE__,__LINE__);
-     err=cudaFree(host_p[cj]);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_GAUSSIAN) {
-     err=cudaFree(host_p[cj]);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_DISK) {
-     err=cudaFree(host_p[cj]);
-     checkCudaError(err,__FILE__,__LINE__);
-    } else if (t->carr[ncl].stype[cj]==STYPE_RING) {
-     err=cudaFree(host_p[cj]);
-     checkCudaError(err,__FILE__,__LINE__);
-    }
-   }
-   free(host_p);
-
-   err=cudaFree(dev_p);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   if (t->dobeam) {
-    err=cudaFree(beamd);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-   err=cudaFree(lld);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(mmd);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(nnd);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(pd);
-   checkCudaError(err,__FILE__,__LINE__);
-   if (t->Nf==1) {
-    err=cudaFree(sId);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sQd);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sUd);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sVd);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-   if (t->dobeam) {
-    err=cudaFree(rad);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(decd);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-   err=cudaFree(styped);
-   checkCudaError(err,__FILE__,__LINE__);
-
-   if (t->Nf>1) {
-    err=cudaFree(sI0d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(f0d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(spec_idxd);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(spec_idx1d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(spec_idx2d);
-    checkCudaError(err,__FILE__,__LINE__);
-
-    err=cudaFree(sQ0d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sU0d);
-    checkCudaError(err,__FILE__,__LINE__);
-    err=cudaFree(sV0d);
-    checkCudaError(err,__FILE__,__LINE__);
-   }
-  }
- }
-/******************* end loop over clusters **************************/
-
- /* free memory */
- err=cudaFreeHost(xlocal);
- checkCudaError(err,__FILE__,__LINE__);
-
- err=cudaFree(ud);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(vd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(wd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(cohd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(barrd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(freqsd);
- checkCudaError(err,__FILE__,__LINE__);
-
- if (t->dobeam) {
-  err=cudaFree(longd);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(latd);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(timed);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(Nelemd);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  for (ci=0; ci<t->N; ci++) {
-   err=cudaFree(xx_p[ci]);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(yy_p[ci]);
-   checkCudaError(err,__FILE__,__LINE__);
-   err=cudaFree(zz_p[ci]);
-   checkCudaError(err,__FILE__,__LINE__);
-  }
-
-  err=cudaFree(xxd);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(yyd);
-  checkCudaError(err,__FILE__,__LINE__);
-  err=cudaFree(zzd);
-  checkCudaError(err,__FILE__,__LINE__);
-
-  free(xx_p);
-  free(yy_p);
-  free(zz_p);
- }
-
- /* reset error state */
- err=cudaGetLastError();
- return NULL;
-
-}
-
-
-static void *
-correct_threadfn(void *data) {
- thread_data_corr_t *t=(thread_data_corr_t*)data;
- /* first, select a GPU, if total threads > MAX_GPU_ID
-  use random selection, else use this thread id */
- int card;
- if (t->tid>MAX_GPU_ID) {
-  card=select_work_gpu(MAX_GPU_ID,t->hst);
- } else {
-  card=t->tid;
- }
- cudaError_t err;
- int cf;
-
- err=cudaSetDevice(card);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaSetDeviceFlags(cudaDeviceLmemResizeToMax);
- checkCudaError(err,__FILE__,__LINE__);
-
- double *xd;
- baseline_t *barrd;
- double *pd;
-
- /* allocate memory in GPU */
- err=cudaMalloc((void**) &xd, t->Nb*8*t->Nf*sizeof(double)); /* coherencies only for Nb baselines, Nf freqs */
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &barrd, t->Nb*sizeof(baseline_t));
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMalloc((void**) &pd, t->N*8*t->nchunk*sizeof(double));
- checkCudaError(err,__FILE__,__LINE__);
-
- /* copy with right offset */
- err=cudaMemcpy(barrd, &(t->barr[t->boff]), t->Nb*sizeof(baseline_t), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaMemcpy(pd, t->pinv, t->N*8*t->nchunk*sizeof(double), cudaMemcpyHostToDevice);
- checkCudaError(err,__FILE__,__LINE__);
-
- for (cf=0; cf<t->Nf; cf++) {
-  err=cudaMemcpy(&xd[cf*8*t->Nb], &(t->x[cf*8*t->Nbase+t->boff*8]), t->Nb*8*sizeof(double), cudaMemcpyHostToDevice);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- /* run kernel */
- cudakernel_correct_residuals(t->Nbase,t->N,t->Nb,t->boff,t->Nf,t->nchunk,xd,pd,barrd);
-
- /* copy back corrected result */
- for (cf=0; cf<t->Nf; cf++) {
-  err=cudaMemcpy(&(t->x[cf*8*t->Nbase+t->boff*8]), &xd[cf*8*t->Nb], t->Nb*8*sizeof(double), cudaMemcpyDeviceToHost);
-  checkCudaError(err,__FILE__,__LINE__);
- }
-
- err=cudaFree(xd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(barrd);
- checkCudaError(err,__FILE__,__LINE__);
- err=cudaFree(pd);
- checkCudaError(err,__FILE__,__LINE__);
-
- /* reset error state */
- err=cudaGetLastError();
- return NULL;
-}
-
-/* p: 8NMx1 parameter array, but M is 'effective' clusters, need to use carr to find the right offset
-  ccid: which cluster to use as correction
-  rho: MMSE robust parameter J+rho I inverted
-
-  phase_only: if >0, and if there is any correction done, use only phase of diagonal elements for correction
-*/
-int
-calculate_residuals_multifreq_withbeam_gpu(double *u,double *v,double *w,double *p,double *x,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, int M,double *freqs,int Nchan, double fdelta,double tdelta, double dec0,
-double ph_ra0, double ph_dec0, double ph_freq0, double *longitude, double *latitude, double *time_utc, int *Nelem, double **xx, double **yy, double **zz, int dobeam, int Nt, int ccid, double rho, int phase_only) {
-
- int nth,ci;
-
- int Nthb0,Nthb;
- pthread_attr_t attr;
- pthread_t *th_array;
- thread_data_pred_t *threaddata;
- thread_data_corr_t *threaddata_corr;
- taskhist thst;
- init_task_hist(&thst);
-
- /* oversubscribe GPU */
- int Ngpu=MAX_GPU_ID+1;
-
- /* calculate min clusters thread can handle */
- Nthb0=(M+Ngpu-1)/Ngpu;
-
- /* setup threads : note: Ngpu is no of GPUs used */
- pthread_attr_init(&attr);
- pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
-
- if ((th_array=(pthread_t*)malloc((size_t)Ngpu*sizeof(pthread_t)))==0) {
-  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-  exit(1);
- }
- if ((threaddata=(thread_data_pred_t*)malloc((size_t)Ngpu*sizeof(thread_data_pred_t)))==0) {
-  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-  exit(1);
- }
-
- /* arrays to store result */
- double *xlocal;
- if ((xlocal=(double*)calloc((size_t)Nbase*8*tilesz*Nchan*Ngpu,sizeof(double)))==0) {
-  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-  exit(1);
- }
-
- /* set common parameters, and split clusters to threads */
- ci=0;
- for (nth=0; nth<Ngpu && ci<M; nth++) {
- if (cm>=0) {
-  double *pm,*pinv,*pphase=0;
-  /* allocate memory for inverse J */
-  if ((pinv=(double*)malloc((size_t)8*N*carr[cm].nchunk*sizeof(double)))==0) {
-   fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
-   exit(1);
-  }
-  if (!phase_only) {
-   for (cj=0; cj<carr[cm].nchunk; cj++) {
- ... > sm.ms.output
+../src/MS/sagecal -d sm.ms -s extended_source_list.txt -c extended_source_list.txt.cluster -n 4 -t 10 -p sm.ms.solutions -e 4 -g 2 -l 10 -m 7 -x 30 -F 1 -j 2 -k -1 -B 1 -W 0 > sm.ms.output