Removed diagnostic routines that are no longer used.
This commit is contained in:
parent
6330095ca2
commit
2d8ff507a8
|
@ -1,270 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "cuda.h"
|
||||
#include <cuComplex.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* enable this for checking for kernel failure */
|
||||
#define CUDA_DBG
|
||||
|
||||
/* In-place thresholded inverse square root:
   x[i] <= 1/sqrt(x[i]) when x[i] > eps, 0 otherwise.
   Launch with any 1-D grid covering at least M threads; tail threads exit. */
__global__ void
kernel_sqrtdiv_fl(int M, float eps, float *__restrict__ x){
 const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
 /* guard: only the first M threads touch data */
 if (idx >= (unsigned int)M) {
  return;
 }
 const float v = x[idx];
 x[idx] = (v > eps) ? (1.0f/sqrtf(v)) : 0.0f;
}
|
||||
|
||||
/* Scale U (MxM, elements of one column contiguous since col = idx/M)
   by the diagonal matrix D (Mx1): U[:,c] <= U[:,c] * D[c].
   One thread per matrix element; guarded for M*M threads. */
__global__ void
kernel_diagmult_fl(int M, float *__restrict__ U, const float *__restrict__ D) {
 const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
 if (idx < (unsigned int)(M*M)) {
  /* column this element belongs to */
  const unsigned int c = idx/M;
  U[idx] *= D[c];
 }
}
|
||||
|
||||
|
||||
/* Row-wise squared norms: d[i] = sum_k J[i,k]^2, one thread per row (N rows,
   M columns, row-major storage so row i occupies J[i*M .. i*M+M-1]).
   Fix: accumulate in a register instead of performing a global-memory
   read-modify-write of d[tid] on every loop iteration; same summation
   order, identical result, far fewer global transactions. */
__global__ void
kernel_jnorm_fl(int N, int M, const float *__restrict__ J, float *__restrict__ d) {
 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
 /* each thread handles one row */
 if (tid<N) {
  float acc=0.0f;
  for (int ci=0; ci<M; ci++) {
   /* J is transposed, so this walks one stored row (= one logical column) */
   float v=J[tid*M+ci];
   acc+=v*v;
  }
  d[tid]=acc; /* single global store per row */
 }
}
|
||||
|
||||
/* Compute the (transposed) Jacobian of the 2x2 complex model
   G1 C G2^H for every (baseline, parameter) pair.
   2-D launch: x dimension spans baselines (Nbase), y spans parameters (M).
   jac: output, row-major, written only when the parameter's station touches
        this baseline (caller zeroes jac beforehand).
   coh: 8 floats per baseline = 4 complex coherencies.
   p:   8 real parameters per station (4 complex Jones entries).
   bb:  3 shorts per baseline: sta1, sta2 (third entry unused here).
   N:   accepted but not referenced in this kernel. */
__global__ void
kernel_jacf_fl2(int Nbase, int M, float *__restrict__ jac, const float *__restrict__ coh, const float *__restrict__ p, const short *__restrict__ bb, int N){
 /* global thread index : equal to the baseline */
 unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
 /* which parameter:0...M */
 unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;

 if(n<Nbase && m<M) {
  int sta1=(int)bb[3*n];
  int sta2=(int)bb[3*n+1];
  /* condition for calculating this baseline sum is
     If this baseline is flagged,
     or if this parameter does not belong to sta1 or sta2
     we do not compute */
  int stc=m>>3; /* station of parameter m: 0...Ns-1 (because M = total par = 8 * Nstations) */
  /* flags are not taken into account */
  if (((stc==sta2)||(stc==sta1))) {

   /* C: 2x2 complex coherency matrix of this baseline, unpacked from 8 floats */
   cuFloatComplex C[4];
   C[0].x=coh[8*n];
   C[0].y=coh[8*n+1];
   C[1].x=coh[8*n+2];
   C[1].y=coh[8*n+3];
   C[2].x=coh[8*n+4];
   C[2].y=coh[8*n+5];
   C[3].x=coh[8*n+6];
   C[3].y=coh[8*n+7];

   /* which parameter of the station exactly: 0..7 */
   int stoff=m-stc*8;
   /* pp1/pp2: station-1 / station-2 parameter vectors; the one being
      differentiated is replaced by the unit vector e_stoff, so the
      product below yields the partial derivative directly */
   float pp1[8];
   float pp2[8];
   if (stc==sta1) {
    for (int cn=0; cn<8; cn++) {
     pp1[cn]=0.0f;
     pp2[cn]=p[sta2*8+cn];
    }
    pp1[stoff]=1.0f;
   } else if (stc==sta2) {
    for (int cn=0; cn<8; cn++) {
     pp2[cn]=0.0f;
     pp1[cn]=p[sta1*8+cn];
    }
    pp2[stoff]=1.0f;
   }

   /* G1: 2x2 complex Jones matrix of station 1 */
   cuFloatComplex G1[4];
   G1[0].x=pp1[0];
   G1[0].y=pp1[1];
   G1[1].x=pp1[2];
   G1[1].y=pp1[3];
   G1[2].x=pp1[4];
   G1[2].y=pp1[5];
   G1[3].x=pp1[6];
   G1[3].y=pp1[7];

   cuFloatComplex T1[4];
   /* T=G1*C */
   T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
   T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
   T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
   T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));

   /* G2: Hermitian transpose of station-2 Jones matrix —
      note indices [1] and [2] are swapped and imaginary parts negated */
   cuFloatComplex G2[4];
   /* conjugate this */
   G2[0].x=pp2[0];
   G2[0].y=-pp2[1];
   G2[2].x=pp2[2];
   G2[2].y=-pp2[3];
   G2[1].x=pp2[4];
   G2[1].y=-pp2[5];
   G2[3].x=pp2[6];
   G2[3].y=-pp2[7];

   /* T2 = T1 * G2^H = G1 C G2^H */
   cuFloatComplex T2[4];
   T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
   T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
   T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
   T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
   /* update jacobian */
   /* NOTE: row major order — column m of the 8 rows of baseline n */
   jac[m+M*8*n]=T2[0].x;
   jac[m+M*(8*n+1)]=T2[0].y;
   jac[m+M*(8*n+2)]=T2[1].x;
   jac[m+M*(8*n+3)]=T2[1].y;
   jac[m+M*(8*n+4)]=T2[2].x;
   jac[m+M*(8*n+5)]=T2[2].y;
   jac[m+M*(8*n+6)]=T2[3].x;
   jac[m+M*(8*n+7)]=T2[3].y;

  }
 }

}
|
||||
|
||||
|
||||
/* only use extern if calling code is C */
|
||||
extern "C"
|
||||
{
|
||||
|
||||
|
||||
/* Host driver for kernel_jacf_fl2: compute the Jacobian for one cluster.
   p: params (Mx1), jac: jacobian (NxM) — both device pointers;
   coh: coherencies, bbh: baseline->station mapping (device);
   Nbase: baselines, Mclus: clusters (unused here), Nstations: stations. */
void
cudakernel_jacf_fl2(float *p, float *jac, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {

 /* 2-D launch: x spans baselines, y spans parameters.
    NOTE: keep the per-block thread counts small here (like 8) */
 dim3 threadsPerBlock(16, 8);
 /* jacobian: Nbase x Nstations (proportional to N), so */
 dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x,
                (M+threadsPerBlock.y-1)/threadsPerBlock.y);

 /* clear jac first: the kernel writes only entries whose parameter's
    station participates in the baseline */
 cudaMemset(jac, 0, N*M*sizeof(float));
 kernel_jacf_fl2<<< numBlocks, threadsPerBlock>>>(Nbase, M, jac, coh, p, bbh, Nstations);

 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 /* surface both launch-config and asynchronous execution errors */
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess)
 {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif

}
|
||||
|
||||
/* Host driver: invert sqrt(singular values), Sd[i] <= 1/sqrt(Sd[i])
   for Sd[i] > eps, 0 otherwise. Sd is a device pointer of length M. */
void
cudakernel_sqrtdiv_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float eps, float *Sd) {

 kernel_sqrtdiv_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, eps, Sd);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 /* check for launch and asynchronous execution failures */
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess)
 {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif

}
|
||||
|
||||
|
||||
/* Host driver: U <= U D,
   U : MxM, D : Mx1 diagonal matrix (both device pointers). */
void
cudakernel_diagmult_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float *U, float *D) {

 kernel_diagmult_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, U, D);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 /* check for launch and asynchronous execution failures */
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess)
 {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif

}
|
||||
|
||||
|
||||
/* Host driver: diag(J^T J),
   d[i] = J[i,:] * J[i,:]
   J: NxM (row major, so J[i,:] is actually J[:,i]), d: Nx1 (device pointers). */
void
cudakernel_jnorm_fl(int ThreadsPerBlock, int BlocksPerGrid, float *J, int N, int M, float *d) {
 kernel_jnorm_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N,M,J,d);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 /* check for launch and asynchronous execution failures */
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess)
 {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
}
|
|
@ -1,550 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "Dirac.h"
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <pthread.h>
|
||||
#include <math.h>
|
||||
|
||||
|
||||
/* Abort with file/line context if a CUDA runtime call failed.
   Compiled to a no-op unless CUDA_DEBUG is defined. */
static void
checkCudaError(cudaError_t err, const char *file, int line)
{
#ifdef CUDA_DEBUG
 if(err != cudaSuccess) {
  fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
  exit(EXIT_FAILURE);
 }
#endif
}
|
||||
|
||||
/* Abort with file/line context if a cuBLAS call failed.
   Compiled to a no-op unless CUDA_DEBUG is defined.
   Fix: `file` is now const char* — callers pass the __FILE__ string
   literal, and binding a string literal to char* is a deprecated
   (C++11: ill-formed) conversion. Callers are unaffected. */
static void
checkCublasError(cublasStatus_t cbstatus, const char *file, int line)
{
#ifdef CUDA_DEBUG
 if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
  fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
  exit(EXIT_FAILURE);
 }
#endif
}
|
||||
|
||||
|
||||
/* For one cluster, find J (J^T W J + eW)^-1 J^T and extract its diagonal.
   p: parameters M x 1 (host)
   rd: residual vector N x 1 on the device — NOTE: accepted for interface
       symmetry but never referenced in this routine
   x: (output, host) diagonal of the leverage matrix, N x 1
   cbhandle/solver_handle: cuBLAS / cuSOLVER handles
   gWORK: preallocated device scratch (layout below)
   tileoff/ntiles: offsets needed for hybrid parameters
   dp: additional info: coherencies, baselines, flags

   Fix: devInfo, work and rwork were cudaMalloc'd here on every call and
   never freed — a per-chunk, per-cluster device memory leak. They are
   now released before returning. */
static int
calculate_leverage(float *p, float *rd, float *x, int M, int N, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle, float *gWORK, int tileoff, int ntiles, me_data_t *dp) {

 /* p is copied to the device and x copied back from the device;
    N can change in hybrid mode, so the caller copies back with the right offset */
 int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
 float *jacd,*xd,*jacTjacd,*pd,*cohd,*Ud,*VTd,*Sd;
 unsigned long int moff=0;
 short *bbd;

 cudaError_t err;

 /* carve device scratch out of gWORK:
    total N + M*N + M*M + M + Nbase*8 + M*M + M*M + M + Nbase*3(short)/(float) */
 xd=&gWORK[moff];       /* leverage diagonal, N */
 moff+=N;
 jacd=&gWORK[moff];     /* Jacobian, M*N */
 moff+=M*N;
 jacTjacd=&gWORK[moff]; /* J^T J, M*M */
 moff+=M*M;
 pd=&gWORK[moff];       /* parameters, M */
 moff+=M;
 cohd=&gWORK[moff];     /* coherencies, Nbase*8 */
 moff+=Nbase*8;
 Ud=&gWORK[moff];       /* SVD left vectors, M*M */
 moff+=M*M;
 VTd=&gWORK[moff];      /* SVD right vectors (also reused as gemm scratch), M*M */
 moff+=M*M;
 Sd=&gWORK[moff];       /* singular values, M */
 moff+=M;

 bbd=(short*)&gWORK[moff]; /* baseline->station map, Nbase*3 shorts */
 moff+=(Nbase*3*sizeof(short))/sizeof(float);

 err=cudaMemcpyAsync(pd, p, M*sizeof(float), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 /* coherencies: offset = cluster offset + time offset */
 err=cudaMemcpyAsync(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 /* baselines: time offset only */
 err=cudaMemcpyAsync(bbd, &(dp->ddbase[3*(dp->Nbase)*(tileoff)]), Nbase*3*sizeof(short), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 cudaDeviceSynchronize();

 int ThreadsPerBlock=DEFAULT_TH_PER_BK;
 int ci,Mi;

 /* extra storage for cusolver */
 int work_size=0;
 int *devInfo;
 err=cudaMalloc((void**)&devInfo, sizeof(int));
 checkCudaError(err,__FILE__,__LINE__);
 float *work;
 float *rwork;
 cusolverDnSgesvd_bufferSize(solver_handle, M, M, &work_size);
 err=cudaMalloc((void**)&work, work_size*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**)&rwork, 5*M*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);

 /* set mem to 0 */
 cudaMemset(xd, 0, N*sizeof(float));

 /* calculate J^T, not taking flags into account */
 cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);

 /* JTJ = J^T J */
 cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
 float cone=1.0f; float czero=0.0f;
 cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_T,M,M,N,&cone,jacd,M,jacd,M,&czero,jacTjacd,M);

 /* add mu * I to JTJ for numerical stability */
 cudakernel_diagmu_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, jacTjacd, 1e-9f);

 /* inv(JTJ) via SVD: inv(JTJ) = Ud Sid VTd (JTJ is symmetric) */
 cusolverDnSgesvd(solver_handle,'A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M,work,work_size,rwork,devInfo);
 cudaDeviceSynchronize();

 /* Sd <= 1/sqrt(Sd) for positive singular values */
 cudakernel_sqrtdiv_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, 1e-9f, Sd);

 /* Ud <= Ud Sid (columns scaled by the diagonal) */
 cudakernel_diagmult_fl(ThreadsPerBlock, (M*M+ThreadsPerBlock-1)/ThreadsPerBlock, M, Ud, Sd);
 /* jacTjacd <= Ud VTd : the matrix square root of the inverse */
 cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_N,M,M,M,&cone,Ud,M,VTd,M,&czero,jacTjacd,M);

 /* recompute J^T (same storage as before) */
 cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);

 /* jacd <= sqrt(B)^T J^T, processed M columns at a time via VTd scratch */
 for (ci=0; ci<(N+M-1)/M;ci++) {
  if (ci*M+M<N) {
   Mi=M;
  } else {
   Mi=N-ci*M; /* ragged last panel */
  }
  cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_T,CUBLAS_OP_N,M,Mi,M,&cone,jacTjacd,M,&jacd[ci*M*M],M,&czero,VTd,M);
  err=cudaMemcpy(&jacd[ci*M*M],VTd,Mi*M*sizeof(float),cudaMemcpyDeviceToDevice);
  checkCudaError(err,__FILE__,__LINE__);
 }

 /* xd[i] <= ||J[i,:]||^2 : the leverage diagonal */
 cudakernel_jnorm_fl(ThreadsPerBlock, (N+ThreadsPerBlock-1)/ThreadsPerBlock, jacd, N, M, xd);

 /* output x <= xd */
 err=cudaMemcpyAsync(x, xd, N*sizeof(float), cudaMemcpyDeviceToHost,0);
 cudaDeviceSynchronize();
 checkCudaError(err,__FILE__,__LINE__);
 checkCublasError(cbstatus,__FILE__,__LINE__);

 /* release cusolver scratch (was leaked before) */
 cudaFree(devInfo);
 cudaFree(work);
 cudaFree(rwork);

 return 0;
}
|
||||
|
||||
/******************** pipeline functions **************************/
|
||||
/* Shared state between the master thread and the two GPU slave threads
   of the diagnostics pipeline. Index [0]/[1] selects the slave. */
typedef struct gb_data_dg_ {
 int status[2]; /* per-slave command: PT_DO_CDERIV / PT_DO_AGPU / PT_DO_DGPU / PT_DO_NOTHING */
 float *p[2]; /* pointer to parameters being used by each thread (depends on cluster) */
 float *xo; /* residual vector (copied to device) */
 float *x[2]; /* output leverage values from each thread */
 int M[2]; /* no. of parameters (per cluster, hybrid) */
 int N[2]; /* no. of visibilities (might change in hybrid mode) */
 me_data_t *lmdata[2]; /* two for each thread */

 /* GPU related info */
 cublasHandle_t cbhandle[2]; /* CUBLAS handles */
 cusolverDnHandle_t solver_handle[2]; /* cuSOLVER handles */
 float *rd[2]; /* residual vector on the device (invariant) */
 float *gWORK[2]; /* GPU scratch buffers, carved up by calculate_leverage() */
 int64_t data_size; /* size of gWORK buffer (bytes) */

} gbdatadg;
|
||||
|
||||
|
||||
/* slave thread 2GPU function.
   Each of the two slaves loops on a double-barrier handshake: the master
   sets gd->status[tid] between gate1 and gate2, the slave reads it after
   gate2 and performs the requested action. Termination: master sets
   pline->terminate and releases gate1. */
static void *
pipeline_slave_code_dg(void *data)
{
 slave_tdata *td=(slave_tdata*)data;
 gbdatadg *gd=(gbdatadg*)(td->pline->data);
 int tid=td->tid;

 while(1) {
  sync_barrier(&(td->pline->gate1)); /* stop at gate 1*/
  if(td->pline->terminate) break; /* if flag is set, break loop */
  sync_barrier(&(td->pline->gate2)); /* stop at gate 2 */
  /* do work */
  if (gd->status[tid]==PT_DO_CDERIV) {
   me_data_t *t=(me_data_t *)gd->lmdata[tid];
   /* divide the tiles into chunks tilesz/nchunk (ceiling division) */
   int tilechunk=(t->tilesz+t->carr[t->clus].nchunk-1)/t->carr[t->clus].nchunk;

   int ci;
   int cj=0;     /* running tile offset */
   int ntiles;

   /* loop over chunks, selecting the right set of parameters and residual vector */
   for (ci=0; ci<t->carr[t->clus].nchunk; ci++) {
    /* last chunk may be ragged */
    if (cj+tilechunk<t->tilesz) {
     ntiles=tilechunk;
    } else {
     ntiles=t->tilesz-cj;
    }

    /* right offset for rd[] and x[] needed and since no overlap,
       can wait for all chunks to complete */
    calculate_leverage(&gd->p[tid][ci*(gd->M[tid])],&gd->rd[tid][8*cj*t->Nbase],&gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->cbhandle[tid], gd->solver_handle[tid], gd->gWORK[tid], cj, ntiles, gd->lmdata[tid]);

    cj=cj+tilechunk;
   }

  } else if (gd->status[tid]==PT_DO_AGPU) {
   /* attach this thread to a GPU and allocate the scratch buffer */
   attach_gpu_to_thread2(tid,&gd->cbhandle[tid],&gd->solver_handle[tid],&gd->gWORK[tid],gd->data_size,1);

   /* copy residual vector to device (stays there until PT_DO_DGPU) */
   cudaError_t err;
   me_data_t *t=(me_data_t *)gd->lmdata[tid];
   err=cudaMalloc((void**)&gd->rd[tid], (size_t)8*t->tilesz*t->Nbase*sizeof(float));
   checkCudaError(err,__FILE__,__LINE__);

   err=cudaMemcpy(gd->rd[tid], gd->xo, 8*t->tilesz*t->Nbase*sizeof(float), cudaMemcpyHostToDevice);
   checkCudaError(err,__FILE__,__LINE__);
  } else if (gd->status[tid]==PT_DO_DGPU) {
   /* release device residual and detach from the GPU */
   cudaFree(gd->rd[tid]);
   detach_gpu_from_thread2(gd->cbhandle[tid],gd->solver_handle[tid],gd->gWORK[tid],1);
  } else if (gd->status[tid]!=PT_DO_NOTHING) { /* catch error */
   fprintf(stderr,"%s: %d: invalid mode for slave tid=%d status=%d\n",__FILE__,__LINE__,tid,gd->status[tid]);
   exit(1);
  }
 }
 return NULL;
}
|
||||
|
||||
/* Initialize the pipeline and start the slaves rolling:
   sets up the two gate barriers (3 participants: master + 2 slaves),
   allocates per-slave thread data and the task history, and spawns
   both slave threads running pipeline_slave_code_dg(). */
static void
init_pipeline_dg(th_pipeline *pline,
     void *data)
{
 pthread_attr_init(&(pline->attr));
 pthread_attr_setdetachstate(&(pline->attr),PTHREAD_CREATE_JOINABLE);

 /* 3 threads meet at each gate, including the master */
 init_th_barrier(&(pline->gate1),3);
 init_th_barrier(&(pline->gate2),3);
 pline->terminate=0;
 pline->data=data; /* data should have pointers to t1 and t2 */

 slave_tdata *s0=(slave_tdata*)malloc(sizeof(slave_tdata));
 if (s0==NULL) {
  fprintf(stderr,"no free memory\n");
  exit(1);
 }
 slave_tdata *s1=(slave_tdata*)malloc(sizeof(slave_tdata));
 if (s1==NULL) {
  fprintf(stderr,"no free memory\n");
  exit(1);
 }
 pline->thst=(taskhist*)malloc(sizeof(taskhist));
 if (pline->thst==NULL) {
  fprintf(stderr,"no free memory\n");
  exit(1);
 }

 init_task_hist(pline->thst);
 s0->pline=pline;
 s1->pline=pline;
 s0->tid=0;
 s1->tid=1;
 /* link back so destroy_pipeline_dg() can free them */
 pline->sd0=s0;
 pline->sd1=s1;
 pthread_create(&(pline->slave0),&(pline->attr),pipeline_slave_code_dg,(void*)s0);
 pthread_create(&(pline->slave1),&(pline->attr),pipeline_slave_code_dg,(void*)s1);
}
|
||||
|
||||
|
||||
|
||||
/* destroy the pipeline */
/* need to kill the slaves first: set terminate, release them from gate 1,
   then join before tearing down barriers and freeing thread data */
static void
destroy_pipeline_dg(th_pipeline *pline)
{

 pline->terminate=1;
 sync_barrier(&(pline->gate1)); /* unblock slaves waiting at gate 1 so they see terminate */
 pthread_join(pline->slave0,NULL);
 pthread_join(pline->slave1,NULL);
 destroy_th_barrier(&(pline->gate1));
 destroy_th_barrier(&(pline->gate2));
 pthread_attr_destroy(&(pline->attr));
 destroy_task_hist(pline->thst);
 free(pline->thst);
 free(pline->sd0); /* allocated by init_pipeline_dg() */
 free(pline->sd1);
 pline->data=NULL;
}
|
||||
/******************** end pipeline functions **************************/
|
||||
|
||||
|
||||
|
||||
/* Calculate St.Laurent-Cook Jacobian leverage.
   xo: residual (replaced by the leverage diagonal when diagmode==1)
   flags: 2 for flags based on uvcut, 1 for normal flags
   coh: coherencies are calculated for all baselines, regardless of flag
   diagmode: 1: replace residual, 2: calc noise/leverage ratio (printed)
   Returns 0 on success.

   Fix: res0 was freed together with the other intermediate buffers but
   is read afterwards by find_sumproduct() in the diagmode!=1 branch —
   a use-after-free. res0 is now freed only after its last use. */
int
calculate_diagnostics(double *u,double *v,double *w,double *p,double *xo,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, complex double *coh, int M,int Mt,int diagmode, int Nt) {

 int cj;
 int n;
 me_data_t lmdata0,lmdata1;
 int Nbase1;

 /* no of data */
 n=Nbase*tilesz*8;
 /* true no of baselines */
 Nbase1=Nbase*tilesz;

 double *ddcoh;
 short *ddbase;

 int c0,c1;

 float *ddcohf, *pf, *xdummy0f, *xdummy1f, *res0, *dgf;
 /********* thread data ******************/
 th_pipeline tp;  /* barrier-based pipeline */
 gbdatadg tpg;    /* shared master/slave state */
 /****************************************/

 lmdata0.clus=lmdata1.clus=-1;
 /* setup data for lmfit */
 lmdata0.u=lmdata1.u=u;
 lmdata0.v=lmdata1.v=v;
 lmdata0.w=lmdata1.w=w;
 lmdata0.Nbase=lmdata1.Nbase=Nbase;
 lmdata0.tilesz=lmdata1.tilesz=tilesz;
 lmdata0.N=lmdata1.N=N;
 lmdata0.barr=lmdata1.barr=barr;
 lmdata0.carr=lmdata1.carr=carr;
 lmdata0.M=lmdata1.M=M;
 lmdata0.Mt=lmdata1.Mt=Mt;
 lmdata0.freq0=lmdata1.freq0=NULL; /* not used */
 lmdata0.Nt=lmdata1.Nt=Nt;
 lmdata0.coh=lmdata1.coh=coh;

 /* rearrange coh for GPU use */
 if ((ddcoh=(double*)calloc((size_t)(M*Nbase1*8),sizeof(double)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((ddcohf=(float*)calloc((size_t)(M*Nbase1*8),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((ddbase=(short*)calloc((size_t)(Nbase1*3),sizeof(short)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 rearrange_coherencies2(Nbase1, barr, coh, ddcoh, ddbase, M, Nt);
 lmdata0.ddcoh=lmdata1.ddcoh=ddcoh;
 lmdata0.ddbase=lmdata1.ddbase=ddbase;
 /* ddcohf (float) << ddcoh (double) */
 double_to_float(ddcohf,ddcoh,M*Nbase1*8,Nt);
 lmdata0.ddcohf=lmdata1.ddcohf=ddcohf;

 /* float copy of the parameters */
 if ((pf=(float*)calloc((size_t)(Mt*8*N),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 double_to_float(pf,p,Mt*8*N,Nt);
 /* residual */
 if ((res0=(float*)calloc((size_t)(n),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 double_to_float(res0,xo,n,Nt);

 /* sum of diagonal values of leverage */
 if ((dgf=(float*)calloc((size_t)(n),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 /* per-slave output buffers */
 if ((xdummy0f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((xdummy1f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 /********** setup threads *******************************/
 /* total GPU storage needed:
    residual = n (separately allocated), diagonal = n,
    and for one cluster: Jacobian = nxm, J^T J = mxm (also inverse) */
 int Mm=8*N; /* no of parameters */
 int64_t data_sz=0;
 data_sz=(int64_t)(n+Mm*n+3*Mm*Mm+3*Mm+Nbase1*8)*sizeof(float)+(int64_t)Nbase1*3*sizeof(short);
 tpg.data_size=data_sz;
 tpg.lmdata[0]=&lmdata0;
 tpg.lmdata[1]=&lmdata1;
 tpg.xo=res0; /* residual */

 init_pipeline_dg(&tp,&tpg);
 sync_barrier(&(tp.gate1)); /* sync at gate 1*/
 tpg.status[0]=tpg.status[1]=PT_DO_AGPU; /* attach GPUs, upload residual */
 sync_barrier(&(tp.gate2)); /* sync at gate 2*/

 sync_barrier(&(tp.gate1)); /* sync at gate 1*/
 tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
 sync_barrier(&(tp.gate2)); /* sync at gate 2*/
 /********** done setup threads *******************************/
 tpg.x[0]=xdummy0f;
 tpg.M[0]=8*N; /* even though size of p is > M, dont change this */
 tpg.N[0]=n; /* Nbase*tilesz*8 */
 tpg.x[1]=xdummy1f;
 tpg.M[1]=8*N; /* even though size of p is > M, dont change this */
 tpg.N[1]=n; /* Nbase*tilesz*8 */

 for (cj=0; cj<M/2; cj++) { /* iter per cluster pairs */
  c0=2*cj;
  c1=2*cj+1;
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
  lmdata0.clus=c0;
  lmdata1.clus=c1;

  /* run this from a separate thread */
  tpg.p[0]=&pf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
  tpg.p[1]=&pf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
  tpg.status[0]=tpg.status[1]=PT_DO_CDERIV;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
  tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  /* add result to the sum */
  my_saxpy(n, xdummy0f, 1.0f, dgf);
  my_saxpy(n, xdummy1f, 1.0f, dgf);
 }
 /* odd cluster out, if M is odd */
 if (M%2) {
  c0=M-1;
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
  tpg.p[0]=&pf[carr[c0].p[0]];
  lmdata0.clus=c0;

  tpg.status[0]=PT_DO_CDERIV;
  tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
  tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  my_saxpy(n, xdummy0f, 1.0f, dgf);
 }
 free(pf);
 free(ddcohf);
 free(xdummy1f);
 free(ddcoh);
 /* NOTE: res0 is NOT freed here — find_sumproduct() below still reads it */

 /******** free threads ***************/
 sync_barrier(&(tp.gate1)); /* sync at gate 1*/
 tpg.status[0]=tpg.status[1]=PT_DO_DGPU; /* free device residual, detach GPUs */
 sync_barrier(&(tp.gate2)); /* sync at gate 2*/
 destroy_pipeline_dg(&tp);
 /******** done free threads ***************/

 /* now add 1's to locations with flagged data */
 create_onezerovec(Nbase1, ddbase, xdummy0f, Nt);
 my_saxpy(n, xdummy0f, 1.0f, dgf);
 free(xdummy0f);
 free(ddbase);
 /* output */
 if (diagmode==1) {
  /* copy back to output */
  float_to_double(xo,dgf,n,Nt);
 } else {
  /* solve system of equations a * leverage + b * 1 = |residual|
     to find a,b scalars, and just print them as output */
  /* find 1^T |r| = sum (|residual|) and lev^T |r| */
  float sum1,sum2;
  find_sumproduct(n, res0, dgf, &sum1, &sum2, Nt);
  float a00,a01,a11;
  a00=my_fnrm2(n,dgf); /* lev^T lev */
  a01=my_fasum(n,dgf); /* = a10 = sum|leverage| */
  a00=a00*a00;
  a11=(float)n; /* sum( 1 ) */
  float r00,r01;
  r00=sum1;
  r01=sum2;
  /* solve A [a b]^T = r by 2x2 elimination */
  float alpha,beta,denom;
  denom=(a00*a11-a01*a01);
  if (denom>1e-6f) { /* can be solved */
   alpha=(r00*a11-r01*a01)/denom;
  } else {
   alpha=0.0f;
  }
  beta=(r00-a00*alpha)/a01;
  printf("Error Noise/Model %e/%e\n",beta,alpha);
 }
 free(res0); /* moved here: freed only after find_sumproduct() used it */
 free(dgf);
 return 0;
}
|
Loading…
Reference in New Issue