The header file and the Solvers folder are redundant as well, so remove them too.

This commit is contained in:
Hanno Spreeuw 2017-11-03 14:27:07 +01:00
parent d6b518116e
commit 1e2c24b31a
74 changed files with 0 additions and 43385 deletions

Binary file not shown.

Binary file not shown.

View File

@ -1,2 +0,0 @@
*.swp
*.swo

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,54 +0,0 @@
# Makefile for the CPU-only solver library (libsolvers.a).
CC=gcc
CXX=g++
#CFLAGS= -Wall -O3 -g #-pg
CFLAGS= -Wall -O3 -fopt-info-optimized
CLIBS= -lm -lpthread
#LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
#LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
INCLUDES= -I.
LIBPATH=
#### glib
GLIBI=-I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -I/usr/lib/x86_64-linux-gnu/glib-2.0/include/ -I/usr/lib64/glib-2.0/include
GLIBL=-lglib-2.0
RANLIB=ranlib

OBJECTS= lmfit_nocuda.o clmfit_nocuda.o lbfgs_nocuda.o myblas.o residual.o robustlm.o updatenu.o robust_lbfgs_nocuda.o rtr_solve.o rtr_solve_robust.o manifold_average.o consensus_poly.o rtr_solve_robust_admm.o admm_solve.o

.PHONY: default
default: libsolvers.a

# One pattern rule replaces the 14 identical per-file compile rules.
%.o: %.c
	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<

# All objects depend on the shared header so they are rebuilt when it
# changes (previously only the archive depended on Solvers.h, so header
# edits did not trigger recompilation of the objects).
$(OBJECTS): Solvers.h

libsolvers.a: $(OBJECTS) Solvers.h
	ar rv $@ $(OBJECTS)
	$(RANLIB) $@

View File

@ -1,88 +0,0 @@
# Makefile for the GPU-enabled solver library (libsolvers-gpu.a).
CC=gcc
CXX=g++
NVCC=nvcc
CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE
CLIBS= -lm -lpthread
LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread
CUDAINC=/usr/local/cuda/include
CUDALIB=-L/usr/local/cuda/lib64 -lcuda -lcudart
#NVCC=/usr/local/cuda/bin/nvcc
#NVCFLAGS=-arch=sm_35 -g -G --ptxas-options=-v -O3
NVCFLAGS=-arch=sm_35 --ptxas-options=-v -O3
#### glib
GLIBI=-I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include/
GLIBL=-lglib-2.0 -L/usr/lib64
# NVML
NVML_INC=/usr/include/nvidia/gdk/
NVML_LIB=-lnvidia-ml -L/usr/lib64/nvidia/
INCLUDES= -I. -I$(CUDAINC) -I$(NVML_INC)
LIBPATH= $(CUDALIB)
RANLIB=ranlib

OBJECTS=lmfit.o lbfgs.o myblas.o mderiv.o clmfit.o clmfit_nocuda.o residual.o barrier.o robust.o robustlm.o oslmfit.o mderiv_fl.o clmfit_fl.o updatenu.o robust_lbfgs_nocuda.o robust_fl.o manifold_fl.o rtr_solve_cuda.o rtr_solve_robust_cuda.o diagnostics.o diag_fl.o manifold_average.o consensus_poly.o rtr_solve_robust_cuda_admm.o rtr_solve_robust_admm.o admm_solve.o load_balance.o

.PHONY: default
default: libsolvers-gpu.a

# Pattern rules replace the per-file compile rules: plain C sources go
# through gcc, CUDA sources through nvcc. make picks the rule whose
# prerequisite (.c or .cu) actually exists for each object.
%.o: %.c
	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
%.o: %.cu
	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<

# All objects depend on the shared header so they are rebuilt when it
# changes (previously only the archive depended on Solvers.h).
$(OBJECTS): Solvers.h

libsolvers-gpu.a: $(OBJECTS) Solvers.h
	ar rv $@ $(OBJECTS)
	$(RANLIB) $@

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,121 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "Solvers.h"
/* implementation of a barrier to sync threads.
The barrier has two doors (enter and exit). Only one door
can be open at a time. Initially the enter door is open.
All threads that enter the barrier are sleeping (wait).
The last thread to enter the barrier will
1)close the enter door
2)wakeup all sleeping threads.
3)open the exit door.
So the woken up threads will leave the barrier one by
one, as they are awoken. The last thread to leave the barrier
will
1)open the enter door
2)close the exit door,
So finally the barrier reaches its initial state
*/
/* Initialize a reusable two-door barrier for N participating threads.
   Sets the barrier to its initial state: empty (tcount==0), enter door
   open. The caller must later release resources with destroy_th_barrier().
   N - number of threads the barrier accommodates. */
void
init_th_barrier(th_barrier *barrier, int N)
{
 barrier->tcount=0; /* initially empty: no thread has entered yet */
 barrier->nthreads=N;
 pthread_mutex_init(&barrier->enter_mutex,NULL);
 pthread_mutex_init(&barrier->exit_mutex,NULL);
 pthread_cond_init(&barrier->lastthread_cond,NULL);
 pthread_cond_init(&barrier->exit_cond,NULL);
}
/* Destroy a barrier created with init_th_barrier().
   Releases both mutexes and both condition variables and zeroes the
   counters. Must not be called while any thread is inside sync_barrier(). */
void
destroy_th_barrier(th_barrier *barrier)
{
 pthread_mutex_destroy(&barrier->enter_mutex);
 pthread_mutex_destroy(&barrier->exit_mutex);
 pthread_cond_destroy(&barrier->lastthread_cond);
 pthread_cond_destroy(&barrier->exit_cond);
 barrier->tcount=barrier->nthreads=0;
}
/* Block until all nthreads participants have reached the barrier.
   Two-door protocol (see file header comment): the enter door is held
   (enter_mutex) while threads accumulate; the last arriving thread
   broadcasts exit_cond to release the sleepers, then itself sleeps on
   lastthread_cond until the others have left, which restores the barrier
   to its initial state so it can be reused.
   NOTE(review): the pthread_cond_wait calls are not wrapped in predicate
   loops, so a spurious wakeup would release a thread early -- confirm
   this is acceptable for the target platforms. */
void
sync_barrier(th_barrier *barrier)
{
 /* trivial case: a single thread never needs to wait */
 if(barrier->nthreads <2) return;
 /* else */
 /* new threads enters the barrier. Now close the entry door
   so that other threads cannot enter the barrier until we are done */
 pthread_mutex_lock(&barrier->enter_mutex);
 /* next lock the exit mutex - no threads can leave either */
 pthread_mutex_lock(&barrier->exit_mutex);
 /* now check to see if this is the last expected thread */
 if( ++(barrier->tcount) < barrier->nthreads) {
  /* not the last thread: reopen the entry door for the others ... */
  pthread_mutex_unlock(&barrier->enter_mutex);
  /* ... and sleep until the last arrival broadcasts exit_cond
     (exit_mutex is released atomically while waiting) */
  pthread_cond_wait(&barrier->exit_cond,&barrier->exit_mutex);
 } else {
  /* this is the last thread to arrive */
  /* wake up all sleeping threads */
  pthread_cond_broadcast(&barrier->exit_cond);
  /* sleep until all woken threads have decremented tcount and the
     second-to-last one signals lastthread_cond below */
  pthread_cond_wait(&barrier->lastthread_cond,&barrier->exit_mutex);
  /* now all threads have left the barrier. so open the entry door again */
  pthread_mutex_unlock(&barrier->enter_mutex);
 }
 /* second-to-last thread leaving (tcount drops to 1: only the
    broadcaster is still inside) wakes the broadcaster */
 if(--(barrier->tcount)==1) {
  /* wakeup the last sleeping thread */
  pthread_cond_broadcast(&barrier->lastthread_cond);
 }
 pthread_mutex_unlock(&barrier->exit_mutex);
}
/* master and two slaves */
//int
//main(int argc, char *argv[]) {
// th_pipeline p;
//
// gbdata g;
//
// init_pipeline(&p,&g);
//sync_barrier(&(p.gate1)); /* stop at gate 1 */
// g.status=0; /* master work */
//sync_barrier(&(p.gate2)); /* stop at gate 2 */
// //exec_pipeline(&p);
//sync_barrier(&(p.gate1)); /* stop at gate 1 */
// g.status=10; /* master work */
//sync_barrier(&(p.gate2)); /* stop at gate 2 */
// //exec_pipeline(&p);
// destroy_pipeline(&p);
// /* still need to free slave_data structs, from data */
// return 0;
//}

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,349 +0,0 @@
/*
*
Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "Solvers.h"
#include <math.h>
#include <stdio.h>
//#define DEBUG
/* Build the matrix of polynomial basis functions evaluated at each frequency.
   B : Npoly x Nf; B[ci*Npoly+cm] = basis function cm evaluated at freqs[ci],
       i.e. each conceptual row of the Npoly x Nf matrix is one basis function
   Npoly : total number of basis functions
   Nf : number of frequencies
   freqs : Nf x 1 array of frequencies
   freq0 : reference frequency
   type :
    0 : [1 ((f-fo)/fo) ((f-fo)/fo)^2 ...] monomial basis
    1 : as type 0, but each basis function is rescaled to unit norm
        over the Nf evaluation points
    2 : Bernstein polynomials sum nCr x^r (1-x)^(n-r), x in [0,1]; the
        min/max of freqs map frequency onto [0,1].
        Note: freqs might not be sorted, so the array is searched for min/max
    3 : [1 ((f-fo)/fo) (fo/f-1) ((f-fo)/fo)^2 (fo/f-1)^2 ...] mixed basis;
        odd Npoly preferred so both families get equal numbers of orders
   Returns 0 always; an unknown type only prints a diagnostic and leaves
   B untouched. */
int
setup_polynomials(double *B, int Npoly, int Nf, double *freqs, double freq0, int type) {
 if (type==0 || type==1) {
  double frat,dsum;
  double invf=1.0/freq0;
  int ci,cm;
  for (ci=0; ci<Nf; ci++) {
   /* powers of (f-f0)/f0 built incrementally: column cm = column (cm-1) * frat */
   B[ci*Npoly]=1.0;
   frat=(freqs[ci]-freq0)*invf;
   for (cm=1; cm<Npoly; cm++) {
    B[ci*Npoly+cm]=B[ci*Npoly+cm-1]*frat;
   }
  }
#ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]);
   }
   printf("\n");
  }
  printf("];\n");
#endif
  if (type==1) {
   /* normalize each basis function (row) to unit Euclidean norm */
   for (cm=0; cm<Npoly; cm++) {
    dsum=0.0;
    for (ci=0; ci<Nf; ci++) {
     dsum+=B[ci*Npoly+cm]*B[ci*Npoly+cm];
    }
    if (dsum>0.0) {
     invf=1.0/sqrt(dsum);
    } else {
     invf=0.0; /* all-zero row: leave it zero instead of dividing by 0 */
    }
    for (ci=0; ci<Nf; ci++) {
     B[ci*Npoly+cm] *=invf;
    }
   }
  }
 } else if (type==2) {
  /* Bernstein polynomials */
  /* my_idamax/my_idamin return 1-based indices (BLAS convention),
     hence the -1 below */
  int idmax=my_idamax(Nf, freqs, 1);
  int idmin=my_idamin(Nf, freqs, 1);
  double fmax=freqs[idmax-1];
  double fmin=freqs[idmin-1];
  double *fact; /* factorial array: fact[k]=k! */
  double *px,*p1x; /* powers of x and (1-x), each Npoly x Nf */
  if ((fact=(double*)calloc((size_t)Npoly,sizeof(double)))==0) {
   printf("%s: %d: no free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((px=(double*)calloc((size_t)Npoly*Nf,sizeof(double)))==0) {
   printf("%s: %d: no free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((p1x=(double*)calloc((size_t)Npoly*Nf,sizeof(double)))==0) {
   printf("%s: %d: no free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  fact[0]=1.0;
  int ci,cj;
  for (ci=1; ci<Npoly; ci++) {
   fact[ci]=fact[ci-1]*(double)ci;
  }
  double invf=1.0/(fmax-fmin);
  double frat;
  for (ci=0; ci<Nf; ci++) {
   /* normalize coordinates: frat = x in [0,1] */
   frat=(freqs[ci]-fmin)*invf;
   px[ci]=1.0;
   p1x[ci]=1.0;
   px[ci+Nf]=frat;
   p1x[ci+Nf]=1.0-frat;
  }
  /* build higher powers incrementally: row cj = row (cj-1) * row 1 */
  for (cj=2; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    px[cj*Nf+ci]=px[(cj-1)*Nf+ci]*px[Nf+ci];
    p1x[cj*Nf+ci]=p1x[(cj-1)*Nf+ci]*p1x[Nf+ci];
   }
  }
  for (cj=0; cj<Npoly; cj++) { /* ci: freq, cj: poly order */
   /* binomial coefficient (Npoly-1 choose cj) from the factorial table */
   frat=fact[Npoly-1]/(fact[Npoly-cj-1]*fact[cj]);
   for (ci=0; ci<Nf; ci++) {
    B[ci*Npoly+cj]=frat*px[cj*Nf+ci]*p1x[(Npoly-cj-1)*Nf+ci];
   }
  }
#ifdef DEBUG
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]);
   }
   printf("\n");
  }
  printf("];\n");
#endif
  free(fact);
  free(px);
  free(p1x);
 } else if (type==3) { /* [1 (f-fo)/fo (fo/f-1) ... */
  double frat;
  double invf=1.0/freq0;
  int ci,cm;
  for (ci=0; ci<Nf; ci++) {
   B[ci*Npoly]=1.0;
   frat=(freqs[ci]-freq0)*invf;
   double lastval=frat;
   /* odd columns 1,3,5,... hold powers of (f-f0)/f0 */
   for (cm=1; cm<Npoly; cm+=2) { /* odd values 1,3,5,... */
    B[ci*Npoly+cm]=lastval;
    lastval*=frat;
   }
   frat=(freq0/freqs[ci]-1.0);
   lastval=frat;
   /* even columns 2,4,6,... hold powers of (f0/f - 1) */
   for (cm=2; cm<Npoly; cm+=2) { /* even values 2,4,6,... */
    B[ci*Npoly+cm]=lastval;
    lastval*=frat;
   }
  }
#ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]);
   }
   printf("\n");
  }
  printf("];\n");
#endif
 } else {
  fprintf(stderr,"%s : %d: undefined polynomial type\n",__FILE__,__LINE__);
 }
 return 0;
}
/* Compute the (weighted) pseudo-inverse of the basis Gram matrix.
   B : Npoly x Nf, each column is the basis vector for one frequency
   Bi : out, Npoly x Npoly pseudo-inverse of sum_f fratio[f] * B(:,f) B(:,f)'
   Npoly : total basis functions
   Nf : number of frequencies
   fratio : Nf x 1 weights (fraction of unflagged data per frequency), so the
   Gram sum is a weighted sum.
   The pseudo-inverse is formed via SVD: Bi = U diag(1/S) V^T, with singular
   values below CLM_EPSILON zeroed (rank truncation).
   NOTE(review): memset and the my_* BLAS/LAPACK wrappers are presumably
   declared via Solvers.h -- confirm. Returns 0; exits on allocation or
   LAPACK failure. */
int
find_prod_inverse(double *B, double *Bi, int Npoly, int Nf, double *fratio) {
 int ci,status,lwork=0;
 double w[1],*WORK,*U,*S,*VT;
 /* set Bi to zero before accumulating the Gram matrix */
 memset(Bi,0,sizeof(double)*Npoly*Npoly);
 /* weighted sum of rank-1 outer products, one per frequency */
 for (ci=0; ci<Nf; ci++) {
  /* outer product: Bi += fratio[ci] * B(:,ci) B(:,ci)^T */
  my_dgemm('N','T',Npoly,Npoly,1,fratio[ci],&B[ci*Npoly],Npoly,&B[ci*Npoly],Npoly,1.0,Bi,Npoly);
 }
#ifdef DEBUG
 int cj;
 printf("BT=[\n");
 for (ci=0; ci<Nf; ci++) {
  for(cj=0; cj<Npoly; cj++) {
   printf("%lf ",B[ci*Npoly+cj]);
  }
  printf("\n");
 }
 printf("];\nBi=[\n");
 for (ci=0; ci<Npoly; ci++) {
  for(cj=0; cj<Npoly; cj++) {
   printf("%lf ",Bi[ci*Npoly+cj]);
  }
  printf("\n");
 }
 printf("];\n");
#endif
 if ((U=(double*)calloc((size_t)Npoly*Npoly,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((VT=(double*)calloc((size_t)Npoly*Npoly,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((S=(double*)calloc((size_t)Npoly,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 /* LAPACK workspace query: lwork=-1 returns the optimal size in w[0] */
 status=my_dgesvd('A','A',Npoly,Npoly,Bi,Npoly,S,U,Npoly,VT,Npoly,w,-1);
 if (!status) {
  lwork=(int)w[0];
 } else {
  printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
  exit(1);
 }
 if ((WORK=(double*)calloc((size_t)lwork,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 /* actual SVD: Bi = U S VT (Bi is overwritten) */
 status=my_dgesvd('A','A',Npoly,Npoly,Bi,Npoly,S,U,Npoly,VT,Npoly,WORK,lwork);
 if (status) {
  printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
  exit(1);
 }
 /* invert singular values (zeroing the tiny ones for rank truncation)
    and scale the columns of U by them */
 for (ci=0; ci<Npoly; ci++) {
  if (S[ci]>CLM_EPSILON) {
   S[ci]=1.0/S[ci];
  } else {
   S[ci]=0.0;
  }
  my_dscal(Npoly,S[ci],&U[ci*Npoly]);
 }
 /* pseudo-inverse: Bi = (U diag(1/S)) V^T */
 my_dgemm('N','N',Npoly,Npoly,Npoly,1.0,U,Npoly,VT,Npoly,0.0,Bi,Npoly);
#ifdef DEBUG
 printf("Bii=[\n");
 for (ci=0; ci<Npoly; ci++) {
  for(cj=0; cj<Npoly; cj++) {
   printf("%lf ",Bi[ci*Npoly+cj]);
  }
  printf("\n");
 }
 printf("];\n");
#endif
 free(U);
 free(S);
 free(VT);
 free(WORK);
 return 0;
}
/* Update the global solution Z from the stacked right-hand side z.
   Z : 8N Npoly x M doubles (real and imaginary parts updated separately)
   N : stations
   M : clusters
   Npoly : number of basis functions
   z : right-hand side, 8NM Npoly x 1 (note: ordered differently from Z)
   Bi : Npoly x Npoly matrix; Bi^T == Bi is assumed (symmetric).
   For each cluster, interleaved complex values are de-interleaved from z
   (stride-4 copies) into R, multiplied by Bi, and re-interleaved into Z.
   Layout sketch (from the original author):
    one block of Z per direction: 2Nx2xNpoly complex == 8NxNpoly reals;
    per direction select 2N,2N,... from z, split into real/imag parts,
    reshape to 2NxNpoly matrices R_1,I_1,R_2,I_2, stack to (2N*4)xNpoly = R,
    then Q = R * Bi^T, written back into Z with the inverse interleaving. */
int
update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi) {
 double *R,*Q;
 /* scratch: R holds the de-interleaved 8N x Npoly block, Q the product */
 if ((R=(double*)calloc((size_t)2*N*Npoly*4,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((Q=(double*)calloc((size_t)2*N*Npoly*4,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 int ci,np;
 for (ci=0; ci<M; ci++) {
  for (np=0; np<Npoly; np++) {
   /* de-interleave: stride-4 reads from z pick out R_1,I_1,R_2,I_2 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M], 4, &R[np*8*N], 1); /* R_1 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+1], 4, &R[np*8*N+2*N], 1); /* I_1 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+2], 4, &R[np*8*N+2*2*N], 1); /* R_2 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+3], 4, &R[np*8*N+3*2*N], 1); /* I_2 */
  }
  /* Q = R * B^T (Bi is symmetric, so 'N' works) */
  memset(Q,0,sizeof(double)*2*N*Npoly*4);
  my_dgemm('N','N',8*N,Npoly,Npoly,1.0,R,8*N,Bi,Npoly,1.0,Q,8*N);
  /* re-interleave: stride-4 writes put the four parts back into Z */
  for (np=0; np<Npoly; np++) {
   my_dcopy(2*N, &Q[np*8*N], 1, &Z[8*N*Npoly*ci+8*N*np], 4);
   my_dcopy(2*N, &Q[np*8*N+2*N], 1, &Z[8*N*Npoly*ci+8*N*np+1], 4);
   my_dcopy(2*N, &Q[np*8*N+2*2*N], 1, &Z[8*N*Npoly*ci+8*N*np+2], 4);
   my_dcopy(2*N, &Q[np*8*N+3*2*N], 1, &Z[8*N*Npoly*ci+8*N*np+3], 4);
  }
 }
 free(R);
 free(Q);
 return 0;
}

Binary file not shown.

View File

@ -1,270 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "cuda.h"
#include <cuComplex.h>
#include <stdio.h>
/* enable this for checking for kernel failure */
#define CUDA_DBG
/* In place over the first M entries: x[i] <- 1/sqrt(x[i]) when x[i] > eps,
   otherwise x[i] <- 0 (guards against division by tiny/zero values). */
__global__ void
kernel_sqrtdiv_fl(int M, float eps, float *__restrict__ x){
 unsigned int idx = blockDim.x*blockIdx.x + threadIdx.x;
 /* only the first M threads do any work */
 if (idx >= (unsigned int)M) return;
 float v = x[idx];
 x[idx] = (v > eps) ? 1.0f/sqrtf(v) : 0.0f;
}
/* Scale each column of the MxM matrix U by the corresponding diagonal
   entry of D: U(:,c) <- U(:,c) * D[c]. One thread per matrix element;
   U is stored column-major, so element t belongs to column t/M. */
__global__ void
kernel_diagmult_fl(int M, float *__restrict__ U, const float *__restrict__ D) {
 unsigned int t = blockIdx.x*blockDim.x + threadIdx.x;
 if (t < (unsigned int)(M*M)) {
  U[t] *= D[t/M];
 }
}
/* Compute per-row squared norms of J: d[i] = sum_c J[i,c]^2.
   J is NxM in row-major order (one thread per row).
   Fix: accumulate in a register instead of read-modify-writing the
   global d[tid] on every iteration (the original did M global loads and
   stores per row); the summation order is unchanged, so the result is
   bitwise identical. */
__global__ void
kernel_jnorm_fl(int N, int M, const float *__restrict__ J, float *__restrict__ d) {
 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
 /* each thread handles one row */
 if (tid<N) {
  float acc=0.0f;
  for (int ci=0; ci<M; ci++) {
   /* J is transposed, so read each column */
   acc=acc+J[tid*M+ci]*J[tid*M+ci];
  }
  d[tid]=acc;
 }
}
/* Compute one column of the Jacobian of the residual w.r.t. one station
   parameter, for one baseline. Thread grid: x indexes the baseline n,
   y indexes the parameter m (8 real parameters per station, so the owning
   station is m>>3 and the parameter-within-station is m&7).
   jac : row-major NxM Jacobian (N=8*Nbase residual components)
   coh : 8 floats (4 complex coherencies) per baseline
   p : station parameters, 8 floats per station
   bb : 3 shorts per baseline: station1, station2, (third entry unused here)
   The derivative of G1*C*G2^H w.r.t. one real parameter is formed by
   replacing the owning station's gain with the elementary matrix that has
   1 at that parameter and 0 elsewhere. */
__global__ void
kernel_jacf_fl2(int Nbase, int M, float *__restrict__ jac, const float *__restrict__ coh, const float *__restrict__ p, const short *__restrict__ bb, int N){
 /* global thread index : equal to the baseline */
 unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
 /* which parameter:0...M */
 unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
 if(n<Nbase && m<M) {
  int sta1=(int)bb[3*n];
  int sta2=(int)bb[3*n+1];
  /* this derivative is zero unless parameter m belongs to one of the
     two stations of baseline n, so skip the work otherwise */
  int stc=m>>3; /* station owning parameter m (M = 8 * Nstations) */
  /* flags are not taken into account */
  if (((stc==sta2)||(stc==sta1))) {
   /* load the 2x2 complex coherency matrix C for this baseline */
   cuFloatComplex C[4];
   C[0].x=coh[8*n];
   C[0].y=coh[8*n+1];
   C[1].x=coh[8*n+2];
   C[1].y=coh[8*n+3];
   C[2].x=coh[8*n+4];
   C[2].y=coh[8*n+5];
   C[3].x=coh[8*n+6];
   C[3].y=coh[8*n+7];
   /* which parameter exactly 0..7 within the owning station */
   int stoff=m-stc*8;
   float pp1[8];
   float pp2[8];
   /* differentiate w.r.t. parameter stoff of the owning station: that
      station's gain becomes the elementary matrix e_stoff, the other
      station keeps its actual parameters */
   if (stc==sta1) {
    for (int cn=0; cn<8; cn++) {
     pp1[cn]=0.0f;
     pp2[cn]=p[sta2*8+cn];
    }
    pp1[stoff]=1.0f;
   } else if (stc==sta2) {
    for (int cn=0; cn<8; cn++) {
     pp2[cn]=0.0f;
     pp1[cn]=p[sta1*8+cn];
    }
    pp2[stoff]=1.0f;
   }
   cuFloatComplex G1[4];
   G1[0].x=pp1[0];
   G1[0].y=pp1[1];
   G1[1].x=pp1[2];
   G1[1].y=pp1[3];
   G1[2].x=pp1[4];
   G1[2].y=pp1[5];
   G1[3].x=pp1[6];
   G1[3].y=pp1[7];
   cuFloatComplex T1[4];
   /* T1 = G1*C (2x2 complex matrix product) */
   T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
   T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
   T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
   T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
   cuFloatComplex G2[4];
   /* G2 = (station-2 gain)^H: conjugate and transpose (note swapped
      off-diagonal indices 1<->2 together with negated imaginary parts) */
   G2[0].x=pp2[0];
   G2[0].y=-pp2[1];
   G2[2].x=pp2[2];
   G2[2].y=-pp2[3];
   G2[1].x=pp2[4];
   G2[1].y=-pp2[5];
   G2[3].x=pp2[6];
   G2[3].y=-pp2[7];
   cuFloatComplex T2[4];
   /* T2 = T1*G2 = G1*C*G2^H */
   T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
   T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
   T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
   T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
   /* write the 8 real components into column m of the Jacobian */
   /* NOTE: row major order */
   jac[m+M*8*n]=T2[0].x;
   jac[m+M*(8*n+1)]=T2[0].y;
   jac[m+M*(8*n+2)]=T2[1].x;
   jac[m+M*(8*n+3)]=T2[1].y;
   jac[m+M*(8*n+4)]=T2[2].x;
   jac[m+M*(8*n+5)]=T2[2].y;
   jac[m+M*(8*n+6)]=T2[3].x;
   jac[m+M*(8*n+7)]=T2[3].y;
  }
 }
}
/* only use extern if calling code is C */
extern "C"
{
/* Host-side driver for kernel_jacf_fl2().
   p: parameters (Mx1), jac: Jacobian (NxM, row major), coh: coherencies,
   bbh: baseline->station mapping, Nbase: baselines, Mclus: clusters
   (unused here), Nstations: stations. Device memory for jac is zeroed
   first because the kernel writes only the nonzero entries. */
void
cudakernel_jacf_fl2(float *p, float *jac, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
 /* NOTE: keep the block dimensions small here */
 dim3 tpb(16, 8);
 /* one thread per (baseline, parameter) pair */
 dim3 grid((Nbase+tpb.x-1)/tpb.x,
           (M+tpb.y-1)/tpb.y);
 cudaMemset(jac, 0, N*M*sizeof(float));
// printf("Kernel Jax data size=%d, params=%d, block=%d,%d, thread=%d,%d, baselines=%d\n",N, M, grid.x,grid.y, tpb.x, tpb.y, Nbase);
 kernel_jacf_fl2<<< grid, tpb >>>(Nbase, M, jac, coh, p, bbh, Nstations);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess)
 {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* Host-side driver for kernel_sqrtdiv_fl():
   Sd[i] <- 1/sqrt(Sd[i]) for Sd[i] > eps, else 0, over the first M entries. */
void
cudakernel_sqrtdiv_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float eps, float *Sd) {
 kernel_sqrtdiv_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, eps, Sd);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess)
 {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* Host-side driver for kernel_diagmult_fl(): U <- U*D where
   U is MxM and D is an Mx1 diagonal (each column of U scaled by D). */
void
cudakernel_diagmult_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float *U, float *D) {
 kernel_diagmult_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, U, D);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess)
 {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* Host-side driver for kernel_jnorm_fl(): d[i] <- ||J[i,:]||^2 where
   J is NxM in row-major order (so J[i,:] corresponds to the i-th row
   of the stored data) and d is Nx1. */
void
cudakernel_jnorm_fl(int ThreadsPerBlock, int BlocksPerGrid, float *J, int N, int M, float *d) {
 kernel_jnorm_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N,M,J,d);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess)
 {
  // print the CUDA error message and exit
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
}

Binary file not shown.

View File

@ -1,550 +0,0 @@
/*
*
Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "Solvers.h"
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <pthread.h>
#include <math.h>
/* Abort with a diagnostic if a CUDA runtime call failed; file/line
   identify the call site.
   NOTE(review): the whole body is compiled out unless CUDA_DEBUG is
   defined, so in normal builds errors pass silently -- confirm intended. */
static void
checkCudaError(cudaError_t err, const char *file, int line)
{
#ifdef CUDA_DEBUG
 if(!err)
  return;
 fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
 exit(EXIT_FAILURE);
#endif
}
/* Abort with a diagnostic if a CUBLAS call failed; file/line identify the
   call site. Like checkCudaError(), this is compiled to a no-op unless
   CUDA_DEBUG is defined.
   Fix: file is now const char * -- the callers pass the string literal
   __FILE__, and this also matches the sibling checkCudaError() signature. */
static void
checkCublasError(cublasStatus_t cbstatus, const char *file, int line)
{
#ifdef CUDA_DEBUG
 if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
  fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
  exit(EXIT_FAILURE);
 }
#endif
}
/* For one cluster, form J (J^T J + e I)^-1 J^T and extract its diagonal
   (the leverage values) into x.
   p : parameters M x 1 (host; copied to device)
   rd : residual vector N x 1 (already on the device, invariant)
   x : output, diagonal of the leverage matrix (copied back to host)
   cbhandle/solver_handle/gWORK : CUBLAS/cuSOLVER handles and a
     preallocated device scratch buffer carved up below
   tileoff/ntiles : tile offset/count, needed for hybrid parameter chunks
   dp : additional info: coherencies, baselines, flags
   Fix: devInfo, work and rwork were cudaMalloc'd on every call and never
   released, leaking device memory per invocation; they are now freed
   before returning. */
static int
calculate_leverage(float *p, float *rd, float *x, int M, int N, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle, float *gWORK, int tileoff, int ntiles, me_data_t *dp) {
 /* p needs to be copied to device and x needs to be copied back from device
   rd always remains in the device (select part with the right offset)
   N will change in hybrid mode, so copy back to x with right offset */
 int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
 float *jacd,*xd,*jacTjacd,*pd,*cohd,*Ud,*VTd,*Sd;
 unsigned long int moff=0;
 short *bbd;
 cudaError_t err;
 /* carve gWORK into sub-buffers; total storage
    N+M*N+M*M+M+Nbase*8+M*M+M*M+M+Nbase*3(short)/(float) */
 xd=&gWORK[moff];
 moff+=N;
 jacd=&gWORK[moff];
 moff+=M*N;
 jacTjacd=&gWORK[moff];
 moff+=M*M;
 pd=&gWORK[moff];
 moff+=M;
 cohd=&gWORK[moff];
 moff+=Nbase*8;
 Ud=&gWORK[moff];
 moff+=M*M;
 VTd=&gWORK[moff];
 moff+=M*M;
 Sd=&gWORK[moff];
 moff+=M;
 bbd=(short*)&gWORK[moff];
 moff+=(Nbase*3*sizeof(short))/sizeof(float);
 err=cudaMemcpyAsync(pd, p, M*sizeof(float), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 /* need to give right offset for coherencies */
 /* offset: cluster offset+time offset */
 err=cudaMemcpyAsync(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 /* correct offset for baselines */
 err=cudaMemcpyAsync(bbd, &(dp->ddbase[3*(dp->Nbase)*(tileoff)]), Nbase*3*sizeof(short), cudaMemcpyHostToDevice,0);
 checkCudaError(err,__FILE__,__LINE__);
 cudaDeviceSynchronize();
 int ThreadsPerBlock=DEFAULT_TH_PER_BK;
 int ci,Mi;
 /* extra storage for cusolver SVD */
 int work_size=0;
 int *devInfo;
 err=cudaMalloc((void**)&devInfo, sizeof(int));
 checkCudaError(err,__FILE__,__LINE__);
 float *work;
 float *rwork;
 cusolverDnSgesvd_bufferSize(solver_handle, M, M, &work_size);
 err=cudaMalloc((void**)&work, work_size*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaMalloc((void**)&rwork, 5*M*sizeof(float));
 checkCudaError(err,__FILE__,__LINE__);
 /* set mem to 0 */
 cudaMemset(xd, 0, N*sizeof(float));
 /* calculate J^T, not taking flags into account */
 cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
 /* calculate JTJ=(J^T J - [e] [W]) */
 //status=culaDeviceSgemm('N','T',M,M,N,1.0f,jacd,M,jacd,M,0.0f,jacTjacd,M);
 //checkStatus(status,__FILE__,__LINE__);
 cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
 float cone=1.0f; float czero=0.0f;
 cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_T,M,M,N,&cone,jacd,M,jacd,M,&czero,jacTjacd,M);
 /* add mu * I to JTJ to regularize the inverse */
 cudakernel_diagmu_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, jacTjacd, 1e-9f);
 /* calculate inv(JTJ) using SVD */
 /* inv(JTJ) = Ud x Sid x VTd : we take into account that JTJ is symmetric */
 //status=culaDeviceSgesvd('A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M);
 //checkStatus(status,__FILE__,__LINE__);
 cusolverDnSgesvd(solver_handle,'A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M,work,work_size,rwork,devInfo);
 cudaDeviceSynchronize();
 /* find Sd= 1/sqrt(Sd) of the singular values (positive singular values) */
 cudakernel_sqrtdiv_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, 1e-9f, Sd);
 /* multiply Ud with Sid (diagonal) Ud <= Ud Sid (columns modified) */
 cudakernel_diagmult_fl(ThreadsPerBlock, (M*M+ThreadsPerBlock-1)/ThreadsPerBlock, M, Ud, Sd);
 /* now multiply Ud VTd to get the square root */
 //status=culaDeviceSgemm('N','N',M,M,M,1.0f,Ud,M,VTd,M,0.0f,jacTjacd,M);
 //checkStatus(status,__FILE__,__LINE__);
 cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_N,M,M,M,&cone,Ud,M,VTd,M,&czero,jacTjacd,M);
 /* calculate J^T, without taking flags into account (use same storage as previous J^T) */
 cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
 /* multiply (J^T)^T sqrt(B) == sqrt(B)^T J^T, taking M columns at a time */
 for (ci=0; ci<(N+M-1)/M;ci++) {
  if (ci*M+M<N) {
   Mi=M;
  } else {
   Mi=N-ci*M; /* last (possibly partial) column block */
  }
  //status=culaDeviceSgemm('T','N',M,Mi,M,1.0f,jacTjacd,M,&jacd[ci*M*M],M,0.0f,VTd,M);
  //checkStatus(status,__FILE__,__LINE__);
  cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_T,CUBLAS_OP_N,M,Mi,M,&cone,jacTjacd,M,&jacd[ci*M*M],M,&czero,VTd,M);
  err=cudaMemcpy(&jacd[ci*M*M],VTd,Mi*M*sizeof(float),cudaMemcpyDeviceToDevice);
  checkCudaError(err,__FILE__,__LINE__);
 }
 /* xd[i] <= ||J[i,:]||^2 */
 cudakernel_jnorm_fl(ThreadsPerBlock, (N+ThreadsPerBlock-1)/ThreadsPerBlock, jacd, N, M, xd);
 /* output x <=xd */
 err=cudaMemcpyAsync(x, xd, N*sizeof(float), cudaMemcpyDeviceToHost,0);
 cudaDeviceSynchronize();
 checkCudaError(err,__FILE__,__LINE__);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 /* release per-call cusolver scratch (was leaked in the original) */
 err=cudaFree(devInfo);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(work);
 checkCudaError(err,__FILE__,__LINE__);
 err=cudaFree(rwork);
 checkCudaError(err,__FILE__,__LINE__);
 return 0;
}
/******************** pipeline functions **************************/
/* Shared state for the two-slave GPU pipeline used by the diagnostics
   (leverage) computation; one array slot per slave thread. */
typedef struct gb_data_dg_ {
 int status[2]; /* command for each slave (e.g. PT_DO_CDERIV, PT_DO_AGPU) */
 float *p[2]; /* pointer to parameters being used by each thread (depends on cluster) */
 float *xo; /* residual vector (copied to device) */
 float *x[2]; /* output leverage values from each thread */
 int M[2]; /* no. of parameters (per cluster,hybrid) */
 int N[2]; /* no. of visibilities (might change in hybrid mode) */
 me_data_t *lmdata[2]; /* two for each thread */
 /* GPU related info */
 cublasHandle_t cbhandle[2]; /* CUBLAS handles */
 cusolverDnHandle_t solver_handle[2]; /* cuSOLVER handles */
 float *rd[2]; /* residual vector on the device (invariant across chunks) */
 float *gWORK[2]; /* GPU scratch buffers */
 int64_t data_size; /* size of each gWORK buffer (bytes) */
} gbdatadg;
/* slave thread 2GPU function: one instance per slave (tid 0 or 1).
   Runs a two-gate barrier loop: the master sets gd->status[tid] while the
   slaves wait between gate1 and gate2, so each pass of the loop executes
   exactly one command per slave. Terminates when pline->terminate is set
   while waiting at gate 1. */
static void *
pipeline_slave_code_dg(void *data)
{
slave_tdata *td=(slave_tdata*)data;
gbdatadg *gd=(gbdatadg*)(td->pline->data);
int tid=td->tid;
while(1) {
sync_barrier(&(td->pline->gate1)); /* stop at gate 1*/
if(td->pline->terminate) break; /* if flag is set, break loop */
sync_barrier(&(td->pline->gate2)); /* stop at gate 2 */
/* do work: dispatch on the command the master set between the gates */
if (gd->status[tid]==PT_DO_CDERIV) {
me_data_t *t=(me_data_t *)gd->lmdata[tid];
/* divide the tiles into chunks tilesz/nchunk (ceiling division) */
int tilechunk=(t->tilesz+t->carr[t->clus].nchunk-1)/t->carr[t->clus].nchunk;
int ci;
int cj=0;
int ntiles;
/* loop over chunks, right set of parameters and residual vector */
for (ci=0; ci<t->carr[t->clus].nchunk; ci++) {
/* last chunk may be shorter than tilechunk */
if (cj+tilechunk<t->tilesz) {
ntiles=tilechunk;
} else {
ntiles=t->tilesz-cj;
}
/* right offset for rd[] and x[] needed and since no overlap,
can wait for all chunks to complete */
calculate_leverage(&gd->p[tid][ci*(gd->M[tid])],&gd->rd[tid][8*cj*t->Nbase],&gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->cbhandle[tid], gd->solver_handle[tid], gd->gWORK[tid], cj, ntiles, gd->lmdata[tid]);
cj=cj+tilechunk;
}
} else if (gd->status[tid]==PT_DO_AGPU) {
/* attach this thread to a GPU, allocate the work buffer, and
   upload the residual vector (reused by every PT_DO_CDERIV pass) */
attach_gpu_to_thread2(tid,&gd->cbhandle[tid],&gd->solver_handle[tid],&gd->gWORK[tid],gd->data_size,1);
/* copy residual vector to device */
cudaError_t err;
me_data_t *t=(me_data_t *)gd->lmdata[tid];
err=cudaMalloc((void**)&gd->rd[tid], (size_t)8*t->tilesz*t->Nbase*sizeof(float));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMemcpy(gd->rd[tid], gd->xo, 8*t->tilesz*t->Nbase*sizeof(float), cudaMemcpyHostToDevice);
checkCudaError(err,__FILE__,__LINE__);
} else if (gd->status[tid]==PT_DO_DGPU) {
/* release device residual, work buffer and library handles */
cudaFree(gd->rd[tid]);
detach_gpu_from_thread2(gd->cbhandle[tid],gd->solver_handle[tid],gd->gWORK[tid],1);
} else if (gd->status[tid]!=PT_DO_NOTHING) { /* catch error */
fprintf(stderr,"%s: %d: invalid mode for slave tid=%d status=%d\n",__FILE__,__LINE__,tid,gd->status[tid]);
exit(1);
}
}
return NULL;
}
/* initialize the pipeline
 and start the slaves rolling:
 allocate per-slave thread data, set up the two barriers for three
 participants (master + 2 slaves) and launch both slave threads */
static void
init_pipeline_dg(th_pipeline *pline,
     void *data)
{
 slave_tdata *sd0,*sd1;
 taskhist *hist;

 init_th_barrier(&(pline->gate1),3); /* 3 threads, including master */
 init_th_barrier(&(pline->gate2),3); /* 3 threads, including master */
 pline->terminate=0;
 pline->data=data; /* data should have pointers to t1 and t2 */

 sd0=(slave_tdata*)malloc(sizeof(slave_tdata));
 sd1=(slave_tdata*)malloc(sizeof(slave_tdata));
 hist=(taskhist*)malloc(sizeof(taskhist));
 if (sd0==0 || sd1==0 || hist==0) {
  fprintf(stderr,"no free memory\n");
  exit(1);
 }
 pline->thst=hist;
 init_task_hist(pline->thst);

 /* link the slaves back to the pipeline so they can be freed later */
 sd0->pline=sd1->pline=pline;
 sd0->tid=0;
 sd1->tid=1;
 pline->sd0=sd0;
 pline->sd1=sd1;

 pthread_attr_init(&(pline->attr));
 pthread_attr_setdetachstate(&(pline->attr),PTHREAD_CREATE_JOINABLE);
 pthread_create(&(pline->slave0),&(pline->attr),pipeline_slave_code_dg,(void*)sd0);
 pthread_create(&(pline->slave1),&(pline->attr),pipeline_slave_code_dg,(void*)sd1);
}
/* destroy the pipeline */
/* need to kill the slaves first: raise the termination flag, then open
   gate 1 so both slaves observe it and exit their loops, then join and
   reclaim every resource allocated by init_pipeline_dg() */
static void
destroy_pipeline_dg(th_pipeline *pline)
{
 pline->terminate=1;
 sync_barrier(&(pline->gate1));
 pthread_join(pline->slave0,NULL);
 pthread_join(pline->slave1,NULL);

 /* slaves are gone: tear down shared state */
 destroy_task_hist(pline->thst);
 free(pline->thst);
 destroy_th_barrier(&(pline->gate1));
 destroy_th_barrier(&(pline->gate2));
 pthread_attr_destroy(&(pline->attr));
 free(pline->sd0);
 free(pline->sd1);
 pline->data=NULL;
}
/******************** end pipeline functions **************************/
/* Calculate St.Laurent-Cook Jacobian leverage
 xo: residual (replaced by the leverage when diagmode==1)
 flags: 2 for flags based on uvcut, 1 for normal flags
 coh: coherencies are calculated for all baselines, regardless of flag
 diagmode: 1: replace residual, 2: calc noise/leverage ratio (printed)
 returns 0 on success.
 FIX: res0 was freed before find_sumproduct() read it in the diagmode==2
 branch (use-after-free); it is now released only after its last use. */
int
calculate_diagnostics(double *u,double *v,double *w,double *p,double *xo,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, complex double *coh, int M,int Mt,int diagmode, int Nt) {
 int cj;
 int n;
 me_data_t lmdata0,lmdata1;
 int Nbase1;

 /* no of data (8 floats per baseline: 4 complex correlations) */
 n=Nbase*tilesz*8;
 /* true no of baselines */
 Nbase1=Nbase*tilesz;

 double *ddcoh;
 short *ddbase;
 int c0,c1;
 float *ddcohf, *pf, *xdummy0f, *xdummy1f, *res0, *dgf;
 /********* thread data ******************/
 th_pipeline tp; /* barrier-gated 2-slave pipeline */
 gbdatadg tpg;   /* shared master/slave state */
 /****************************************/

 /* no cluster selected yet; set per iteration below */
 lmdata0.clus=lmdata1.clus=-1;

 /* setup data for lmfit */
 lmdata0.u=lmdata1.u=u;
 lmdata0.v=lmdata1.v=v;
 lmdata0.w=lmdata1.w=w;
 lmdata0.Nbase=lmdata1.Nbase=Nbase;
 lmdata0.tilesz=lmdata1.tilesz=tilesz;
 lmdata0.N=lmdata1.N=N;
 lmdata0.barr=lmdata1.barr=barr;
 lmdata0.carr=lmdata1.carr=carr;
 lmdata0.M=lmdata1.M=M;
 lmdata0.Mt=lmdata1.Mt=Mt;
 lmdata0.freq0=lmdata1.freq0=NULL; /* not used */
 lmdata0.Nt=lmdata1.Nt=Nt;
 lmdata0.coh=lmdata1.coh=coh;

 /* rearrange coh for GPU use */
 if ((ddcoh=(double*)calloc((size_t)(M*Nbase1*8),sizeof(double)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((ddcohf=(float*)calloc((size_t)(M*Nbase1*8),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((ddbase=(short*)calloc((size_t)(Nbase1*3),sizeof(short)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 rearrange_coherencies2(Nbase1, barr, coh, ddcoh, ddbase, M, Nt);
 lmdata0.ddcoh=lmdata1.ddcoh=ddcoh;
 lmdata0.ddbase=lmdata1.ddbase=ddbase;

 /* ddcohf (float) << ddcoh (double) */
 double_to_float(ddcohf,ddcoh,M*Nbase1*8,Nt);
 lmdata0.ddcohf=lmdata1.ddcohf=ddcohf;

 /* single-precision copy of the parameters */
 if ((pf=(float*)calloc((size_t)(Mt*8*N),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 double_to_float(pf,p,Mt*8*N,Nt);
 /* residual */
 if ((res0=(float*)calloc((size_t)(n),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 double_to_float(res0,xo,n,Nt);
 /* sum of diagonal values of leverage */
 if ((dgf=(float*)calloc((size_t)(n),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 /* per-slave scratch output */
 if ((xdummy0f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((xdummy1f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 /********** setup threads *******************************/
 /* also calculate the total storage needed to be allocated on a GPU */
 /* determine total size for memory allocation
   residual = n (separately allocated)
   diagonal = n
   For one cluster,
   Jacobian = nxm, J^T J = mxm, (also inverse)
 */
 int Mm=8*N; /* no of parameters */
 int64_t data_sz=0;
 data_sz=(int64_t)(n+Mm*n+3*Mm*Mm+3*Mm+Nbase1*8)*sizeof(float)+(int64_t)Nbase1*3*sizeof(short);
 tpg.data_size=data_sz;
 tpg.lmdata[0]=&lmdata0;
 tpg.lmdata[1]=&lmdata1;
 tpg.xo=res0; /* residual */

 init_pipeline_dg(&tp,&tpg);
 /* first cycle: both slaves attach to a GPU and upload the residual */
 sync_barrier(&(tp.gate1)); /* sync at gate 1*/
 tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
 sync_barrier(&(tp.gate2)); /* sync at gate 2*/
 sync_barrier(&(tp.gate1)); /* sync at gate 1*/
 tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
 sync_barrier(&(tp.gate2)); /* sync at gate 2*/
 /********** done setup threads *******************************/

 tpg.x[0]=xdummy0f;
 tpg.M[0]=8*N; /* even though size of p is > M, dont change this */
 tpg.N[0]=n; /* Nbase*tilesz*8 */
 tpg.x[1]=xdummy1f;
 tpg.M[1]=8*N; /* even though size of p is > M, dont change this */
 tpg.N[1]=n; /* Nbase*tilesz*8 */

 for (cj=0; cj<M/2; cj++) { /* iter per cluster pairs */
  c0=2*cj;
  c1=2*cj+1;
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
  lmdata0.clus=c0;
  lmdata1.clus=c1;
  /* run this from a separate thread */
  tpg.p[0]=&pf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
  tpg.p[1]=&pf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
  tpg.status[0]=tpg.status[1]=PT_DO_CDERIV;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
  tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  /* add result to the sum */
  my_saxpy(n, xdummy0f, 1.0f, dgf);
  my_saxpy(n, xdummy1f, 1.0f, dgf);
 }
 /* odd cluster out, if M is odd: only slave 0 works */
 if (M%2) {
  c0=M-1;
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
  tpg.p[0]=&pf[carr[c0].p[0]];
  lmdata0.clus=c0;
  tpg.status[0]=PT_DO_CDERIV;
  tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  sync_barrier(&(tp.gate1)); /* sync at gate 1 */
  tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
  sync_barrier(&(tp.gate2)); /* sync at gate 2 */
  my_saxpy(n, xdummy0f, 1.0f, dgf);
 }

 free(pf);
 free(ddcohf);
 free(xdummy1f);
 free(ddcoh);
 /* NOTE: res0 is NOT freed here — find_sumproduct() below still reads it
    when diagmode!=1 (the previous code freed it here: use-after-free) */

 /******** free threads ***************/
 sync_barrier(&(tp.gate1)); /* sync at gate 1*/
 tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
 sync_barrier(&(tp.gate2)); /* sync at gate 2*/
 destroy_pipeline_dg(&tp);
 /******** done free threads ***************/

 /* now add 1's to locations with flagged data */
 /* create array for adding */
 create_onezerovec(Nbase1, ddbase, xdummy0f, Nt);
 my_saxpy(n, xdummy0f, 1.0f, dgf);
 free(xdummy0f);
 free(ddbase);

 if (diagmode==1) {
  /* copy back to output */
  float_to_double(xo,dgf,n,Nt);
 } else {
  /* solve system of equations a * leverage + b * 1 = |residual|
   to find a,b scalars, and just print them as output */
  /* find 1^T |r| = sum (|residual|) and lev^T |r| */
  float sum1,sum2;
  find_sumproduct(n, res0, dgf, &sum1, &sum2, Nt);
  float a00,a01,a11;
  a00=my_fnrm2(n,dgf); /* ||lev||, squared below to get lev^T lev */
  a01=my_fasum(n,dgf); /* = a10 = sum|leverage| */
  a00=a00*a00;
  a11=(float)n; /* sum( 1 ) */
  float r00,r01;
  r00=sum1;
  r01=sum2;
  /* solve A [a b]^T = r  (2x2 normal equations) */
  float alpha,beta,denom;
  denom=(a00*a11-a01*a01);
  if (denom>1e-6f) { /* can be solved */
   alpha=(r00*a11-r01*a01)/denom;
  } else {
   alpha=0.0f;
  }
  /* guard against division by zero when the leverage is identically zero */
  beta=(a01!=0.0f?(r00-a00*alpha)/a01:0.0f);
  printf("Error Noise/Model %e/%e\n",beta,alpha);
 }
 free(res0);
 free(dgf);
 return 0;
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,926 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "Solvers.h"
#include <pthread.h>
/**** repeated code here ********************/
/* 2x2 Jones matrix product C = A*B.
   a, b, c: row-major 2x2 complex matrices stored as 4 consecutive entries */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void
amb(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
 int row,col;
 for (row=0; row<2; row++) {
  for (col=0; col<2; col++) {
   /* c[row][col] = a[row][0]*b[0][col] + a[row][1]*b[1][col] */
   c[2*row+col]=a[2*row]*b[col]+a[2*row+1]*b[2+col];
  }
 }
}
/* 2x2 Jones matrix product with Hermitian transpose: C = A*B^H.
   a, b, c: row-major 2x2 complex matrices stored as 4 consecutive entries */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void
ambt(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
 int row,col;
 for (row=0; row<2; row++) {
  for (col=0; col<2; col++) {
   /* c[row][col] = sum_k a[row][k]*conj(b[col][k]) */
   c[2*row+col]=a[2*row]*conj(b[2*col])+a[2*row+1]*conj(b[2*col+1]);
  }
 }
}
/**** end repeated code ********************/
/* worker thread for a cpu */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void *
cpu_calc_deriv(void *adata) {
thread_data_grad_t *t=(thread_data_grad_t*)adata;
int ci,nb;
int stc,stoff,stm,sta1,sta2;
int N=t->N; /* stations */
int M=t->M; /* clusters */
int Nbase=(t->Nbase)*(t->tilesz);
complex double xr[4]; /* residuals */
complex double G1[4],G2[4],C[4],T1[4],T2[4];
double pp[8];
complex double csum;
int cli,tpchunk,pstart,nchunk,tilesperchunk,stci,ttile,tptile,poff;
/* iterate over each paramter */
for (ci=t->g_start; ci<=t->g_end; ++ci) {
t->g[ci]=0.0;
/* find station and parameter corresponding to this value of ci */
/* this parameter should correspond to the right baseline (x tilesz)
to contribute to the derivative */
cli=0;
while((cli<M) && (ci<t->carr[cli].p[0] || ci>t->carr[cli].p[0]+8*N*t->carr[cli].nchunk-1)) {
cli++;
}
/* now either cli>=M: cluster not found
or cli<M and cli is the right cluster */
if (cli==M && ci>=t->carr[cli-1].p[0] && ci<=t->carr[cli-1].p[0]+8*N*t->carr[cli-1].nchunk-1) {
cli--;
}
if (cli<M) {
/* right parameter offset */
stci=ci-t->carr[cli].p[0];
stc=(stci%(8*N))/8; /* 0..N-1 */
/* make sure this baseline contribute to this parameter */
tpchunk=stci/(8*N);
nchunk=t->carr[cli].nchunk;
pstart=t->carr[cli].p[0];
tilesperchunk=(t->tilesz+nchunk-1)/nchunk;
/* iterate over all baselines and accumulate sum */
for (nb=0; nb<Nbase; ++nb) {
/* which tile is this ? */
ttile=nb/t->Nbase;
/* which chunk this tile belongs to */
tptile=ttile/tilesperchunk;
/* now tptile has to match tpchunk, otherwise ignore calculation */
if (tptile==tpchunk) {
sta1=t->barr[nb].sta1;
sta2=t->barr[nb].sta2;
if (((stc==sta1)||(stc==sta2))&& !t->barr[nb].flag) {
/* this baseline has a contribution */
/* which paramter of this station */
stoff=(stci%(8*N))%8; /* 0..7 */
/* which cluster */
stm=cli; /* 0..M-1 */
/* exact expression for derivative
2 real( vec^H(residual_this_baseline)
* vec(-J_{pm}C_{pqm} J_{qm}^H)
where m: chosen cluster
J_{pm},J_{qm} Jones matrices for baseline p-q
depending on the parameter, J ==> E
E: zero matrix, except 1 at location of m
residual : in x[8*nb:8*nb+7]
C coh: in coh[8*M*nb+m*8:8*M*nb+m*8+7] (double storage)
coh[4*M*nb+4*m:4*M*nb+4*m+3] (complex storage)
J_p,J_q: in p[sta1*8+m*8*N: sta1*8+m*8*N+7]
and p[sta2*8+m*8*N: sta2*8+m*8*N+ 7]
*/
/* read in residual vector, conjugated */
xr[0]=(t->x[nb*8])-_Complex_I*(t->x[nb*8+1]);
xr[1]=(t->x[nb*8+2])-_Complex_I*(t->x[nb*8+3]);
xr[2]=(t->x[nb*8+4])-_Complex_I*(t->x[nb*8+5]);
xr[3]=(t->x[nb*8+6])-_Complex_I*(t->x[nb*8+7]);
/* read in coherency */
C[0]=t->coh[4*M*nb+4*stm];
C[1]=t->coh[4*M*nb+4*stm+1];
C[2]=t->coh[4*M*nb+4*stm+2];
C[3]=t->coh[4*M*nb+4*stm+3];
memset(pp,0,sizeof(double)*8);
if (stc==sta1) {
/* this station parameter gradient */
pp[stoff]=1.0;
memset(G1,0,sizeof(complex double)*4);
G1[0]=pp[0]+_Complex_I*pp[1];
G1[1]=pp[2]+_Complex_I*pp[3];
G1[2]=pp[4]+_Complex_I*pp[5];
G1[3]=pp[6]+_Complex_I*pp[7];
poff=pstart+tpchunk*8*N+sta2*8;
G2[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
G2[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
G2[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+4]);
G2[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
} else if (stc==sta2) {
memset(G2,0,sizeof(complex double)*4);
pp[stoff]=1.0;
G2[0]=pp[0]+_Complex_I*pp[1];
G2[1]=pp[2]+_Complex_I*pp[3];
G2[2]=pp[4]+_Complex_I*pp[5];
G2[3]=pp[6]+_Complex_I*pp[7];
poff=pstart+tpchunk*8*N+sta1*8;
G1[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
G1[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
G1[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+5]);
G1[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
}
/* T1=G1*C */
amb(G1,C,T1);
/* T2=T1*G2' */
ambt(T1,G2,T2);
/* calculate product xr*vec(J_p C J_q^H ) */
csum=xr[0]*T2[0];
csum+=xr[1]*T2[1];
csum+=xr[2]*T2[2];
csum+=xr[3]*T2[3];
/* accumulate sum */
t->g[ci]+=-2.0*creal(csum);
}
}
}
}
}
return NULL;
}
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
/* Compute the gradient g of f(p)=||func(p)-xo||^2 at p by splitting the
   m parameters across Nt worker threads running cpu_calc_deriv().
   NOTE(review): 'step' is not referenced in this body — the workers use an
   analytic derivative; the finite-difference formula below is historical.
   returns 0 */
static int
func_grad(
 void (*func)(double *p, double *hx, int m, int n, void *adata),
 double *p, double *g, double *xo, int m, int n, double step, void *adata) {
 /* gradient for each parameter is
 (||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step
 i=0,...,m-1 for all parameters
 e_i: unit vector, 1 only at i-th location
 */
 double *x; /* array to store residual */
 int ci;
 me_data_t *dp=(me_data_t*)adata;
 int Nt=dp->Nt;
 pthread_attr_t attr;
 pthread_t *th_array;
 thread_data_grad_t *threaddata;
 if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 /* evaluate func once, store in x, and create threads */
 /* and calculate the residual x=func(p)-xo (sign convention: see workers) */
 func(p,x,m,n,adata);
 /* calculate x<=x-xo */
 my_daxpy(n,xo,-1.0,x);
 /* setup threads */
 pthread_attr_init(&attr);
 pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
 if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 if ((threaddata=(thread_data_grad_t*)malloc((size_t)Nt*sizeof(thread_data_grad_t)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 int nth,nth1,Nparm;
 /* parameters per thread (ceiling division) */
 Nparm=(m+Nt-1)/Nt;
 /* each thread will calculate derivative of part of
 parameters */
 ci=0;
 for (nth=0; nth<Nt; nth++) {
 threaddata[nth].Nbase=dp->Nbase;
 threaddata[nth].tilesz=dp->tilesz;
 threaddata[nth].barr=dp->barr;
 threaddata[nth].carr=dp->carr;
 threaddata[nth].M=dp->M;
 threaddata[nth].N=dp->N;
 threaddata[nth].coh=dp->coh;
 threaddata[nth].m=m;
 threaddata[nth].n=n;
 threaddata[nth].x=x;
 threaddata[nth].p=p;
 threaddata[nth].g=g;
 threaddata[nth].g_start=ci;
 threaddata[nth].g_end=ci+Nparm-1;
 /* clamp the last chunk so g_end stays inside [0,m-1] */
 if (threaddata[nth].g_end>=m) {
  threaddata[nth].g_end=m-1;
 }
 ci=ci+Nparm;
 pthread_create(&th_array[nth],&attr,cpu_calc_deriv,(void*)(&threaddata[nth]));
 }
 /* now wait for threads to finish */
 for(nth1=0; nth1<nth; nth1++) {
 pthread_join(th_array[nth1],NULL);
 }
 pthread_attr_destroy(&attr);
 free(th_array);
 free(threaddata);
 free(x);
 return 0;
}
/* use algorithm 9.1 (Nocedal & Wright two-loop recursion) to compute pk=Hk gk */
/* pk,gk: size m x 1
 s, y: size mM x 1 (ring buffer of parameter/gradient differences)
 rho: size M x 1 (1/y_k^T s_k)
 ii: true location of the k th values in s,y
 FIXES: idx was allocated with sizeof(double) per int element; also
 allocate at least one element so calloc(0,..) returning NULL (allowed
 by the C standard) cannot trigger a spurious out-of-memory exit when M==0. */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void
mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, int M, int ii) {
 int ci;
 double *alphai;
 int *idx; /* store sorted locations of s, y here */
 double gamma,beta;

 if ((alphai=(double*)calloc((size_t)(M>0?M:1),sizeof(double)))==0) {
#ifndef USE_MIC
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
  exit(1);
 }
 if ((idx=(int*)calloc((size_t)(M>0?M:1),sizeof(int)))==0) {
#ifndef USE_MIC
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
  exit(1);
 }
 if (M>0) {
  /* find the location of k-1 th value */
  if (ii>0) {
   ii=ii-1;
  } else {
   ii=M-1;
  }
  /* s,y will have 0,1,...,ii,ii+1,...M-1 */
  /* map this to ii+1,ii+2,...,M-1,0,1,..,ii (oldest to newest) */
  for (ci=0; ci<M-ii-1; ci++){
   idx[ci]=(ii+ci+1);
  }
  for(ci=M-ii-1; ci<M; ci++) {
   idx[ci]=(ci-M+ii+1);
  }
 }
#ifdef DEBUG
 printf("prod M=%d, current ii=%d\n",M,ii);
 for(ci=0; ci<M; ci++) {
  printf("%d->%d ",ci,idx[ci]);
 }
 printf("\n");
#endif
 /* q = grad(f)k : pk<=gk */
 my_dcopy(m,gk,1,pk,1);
 /* first loop: newest to oldest */
 for (ci=0; ci<M; ci++) {
  /* alphai=rhoi si^T*q */
  alphai[M-ci-1]=rho[idx[M-ci-1]]*my_ddot(m,&s[m*idx[M-ci-1]],pk);
  /* q=q-alphai yi */
  my_daxpy(m,&y[m*idx[M-ci-1]],-alphai[M-ci-1],pk);
 }
 /* r=Hk(0) q : initial hessian */
 /* gamma=s(k-1)^T*y(k-1)/y(k-1)^T*y(k-1)*/
 gamma=1.0;
 if (M>0) {
  gamma=my_ddot(m,&s[m*idx[M-1]],&y[m*idx[M-1]]);
  gamma/=my_ddot(m,&y[m*idx[M-1]],&y[m*idx[M-1]]);
  /* Hk(0)=gamma I, so scale q by gamma */
  /* r= Hk(0) q */
  my_dscal(m,gamma,pk);
 }
 /* second loop: oldest to newest */
 for (ci=0; ci<M; ci++) {
  /* beta=rhoi yi^T * r */
  beta=rho[idx[ci]]*my_ddot(m,&y[m*idx[ci]],pk);
  /* r = r + (alphai-beta)*si */
  my_daxpy(m,&s[m*idx[ci]],alphai[ci]-beta,pk);
 }

 free(alphai);
 free(idx);
}
/* cubic interpolation in interval [a,b] (a>b is possible)
 to find step that minimizes cost function phi(t)=||func(xk+t pk)-xo||^2;
 derivatives of phi are estimated by central differences with spacing 'step';
 returns a, b, or the interior root of the interpolating cubic, whichever
 gives the smallest phi value */
/* func: vector function
 xk: parameter values size m x 1 (at which step is calculated)
 pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
 a/b: interval for interpolation
 x: size n x 1 (storage)
 xp: size m x 1 (storage)
 xo: observed data size n x 1
 n: size of vector function
 step: step size for differencing
 adata: additional data passed to the function
*/
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static double
cubic_interp(
 void (*func)(double *p, double *hx, int m, int n, void *adata),
 double *xk, double *pk, double a, double b, double *x, double *xp, double *xo, int m, int n, double step, void *adata) {

 double f0,f1,f0d,f1d; /* function values and derivatives at a,b */
 double p01,p02,z0,fz0;
 double aa,cc;

 /* evaluate phi(a) */
 my_dcopy(m,xk,1,xp,1); /* xp<=xk */
 my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */
 func(xp,x,m,n,adata);
 my_daxpy(n,xo,-1.0,x);
 f0=my_dnrm2(n,x);
 f0*=f0;
 /* grad(phi_0): evaluate at -step and +step (central difference) */
 my_daxpy(m,pk,step,xp); /* xp<=xp+(a+step)*pk */
 func(xp,x,m,n,adata);
 my_daxpy(n,xo,-1.0,x);
 p01=my_dnrm2(n,x);
 my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(a-step)*pk */
 func(xp,x,m,n,adata);
 my_daxpy(n,xo,-1.0,x);
 p02=my_dnrm2(n,x);
 f0d=(p01*p01-p02*p02)/(2.0*step);

 /* evaluate phi(b) and its derivative likewise */
 my_dcopy(m,xk,1,xp,1); /* xp<=xk */
 my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */
 func(xp,x,m,n,adata);
 my_daxpy(n,xo,-1.0,x);
 f1=my_dnrm2(n,x);
 f1*=f1;
 /* grad(phi_1): evaluate at -step and +step */
 my_daxpy(m,pk,step,xp); /* xp<=xp+(b+step)*pk */
 func(xp,x,m,n,adata);
 my_daxpy(n,xo,-1.0,x);
 p01=my_dnrm2(n,x);
 my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(b-step)*pk */
 func(xp,x,m,n,adata);
 my_daxpy(n,xo,-1.0,x);
 p02=my_dnrm2(n,x);
 f1d=(p01*p01-p02*p02)/(2.0*step);

 //printf("Interp a,f(a),f'(a): (%lf,%lf,%lf) (%lf,%lf,%lf)\n",a,f0,f0d,b,f1,f1d);
 /* cubic poly in [0,1] is f0+f0d z+eta z^2+xi z^3
 where eta=3(f1-f0)-2f0d-f1d, xi=f0d+f1d-2(f1-f0)
 derivative f0d+2 eta z+3 xi z^2 => cc+bb z+aa z^2 */
 aa=3.0*(f0-f1)/(b-a)+(f1d-f0d);
 p01=aa*aa-f0d*f1d;
 /* root exist? */
 if (p01>0.0) {
 /* root */
 cc=sqrt(p01);
 z0=b-(f1d+cc-aa)*(b-a)/(f1d-f0d+2.0*cc);
 /* FIXME: check if this is within boundary */
 aa=MAX(a,b);
 cc=MIN(a,b);
 //printf("Root=%lf, in [%lf,%lf]\n",z0,cc,aa);
 if (z0>aa || z0<cc) {
  /* root outside [min(a,b),max(a,b)]: poison fz0 so it is never chosen */
  fz0=f0+f1;
 } else {
 /* evaluate function for this root */
 my_dcopy(m,xk,1,xp,1); /* xp<=xk */
 my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(z0)*pk */
 func(xp,x,m,n,adata);
 my_daxpy(n,xo,-1.0,x);
 fz0=my_dnrm2(n,x);
 fz0*=fz0;
 }
 /* now choose between f0,f1,fz0,fz1 */
 if (f0<f1 && f0<fz0) {
  return a;
 }
 if (f1<fz0) {
  return b;
 }
 /* else */
 return (z0);
 } else {
 /* no root: find the value from a or b that minimizes func */
 if (f0<f1) {
  return a;
 } else {
  return b;
 }
 }
 return 0; /* not reached: both branches above return */
}
/*************** Fletcher line search **********************************/
/* zoom function for line search: refine the bracket [aj,bj] until a step
   satisfying the (strong) Wolfe conditions is found; gives up after 10
   refinement rounds and falls back to the last trial step */
/* func: vector function
 xk: parameter values size m x 1 (at which step is calculated)
 pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
 a/b: bracket interval [a,b] (a>b) is possible
 x: size n x 1 (storage)
 xp: size m x 1 (storage)
 phi_0: phi(0)
 gphi_0: grad(phi(0))
 xo: observed data size n x 1
 n: size of vector function
 step: step size for differencing
 adata: additional data passed to the function
*/
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static double
linesearch_zoom(
 void (*func)(double *p, double *hx, int m, int n, void *adata),
 double *xk, double *pk, double a, double b, double *x, double *xp, double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {

 double alphaj,phi_j,phi_aj;
 double gphi_j,p01,p02,aj,bj;
 double alphak=1.0;
 int ci,found_step=0;

 aj=a;
 bj=b;
 ci=0;
 while(ci<10) {
 /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */
 p01=aj+t2*(bj-aj);
 p02=bj-t3*(bj-aj);
 alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
 //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj);

 /* evaluate phi(alphaj) */
 my_dcopy(m,xk,1,xp,1); /* xp<=xk */
 my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */
 func(xp,x,m,n,adata);
 /* calculate x<=x-xo */
 my_daxpy(n,xo,-1.0,x);
 phi_j=my_dnrm2(n,x);
 phi_j*=phi_j;

 /* evaluate phi(aj) */
 my_dcopy(m,xk,1,xp,1); /* xp<=xk */
 my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */
 func(xp,x,m,n,adata);
 /* calculate x<=x-xo */
 my_daxpy(n,xo,-1.0,x);
 phi_aj=my_dnrm2(n,x);
 phi_aj*=phi_aj;

 if ((phi_j>phi_0+rho*alphaj*gphi_0) || phi_j>=phi_aj) {
  /* sufficient decrease failed: shrink upper end of bracket */
  bj=alphaj; /* aj unchanged */
 } else {
  /* evaluate grad(alphaj) by central difference */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
  func(xp,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */
  func(xp,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  gphi_j=(p01*p01-p02*p02)/(2.0*step);

  /* termination due to roundoff/other errors pp. 38, Fletcher */
  if ((aj-alphaj)*gphi_j<=step) {
   alphak=alphaj;
   found_step=1;
   break;
  }
  /* curvature (strong Wolfe) condition */
  if (fabs(gphi_j)<=-sigma*gphi_0) {
   alphak=alphaj;
   found_step=1;
   break;
  }
  if (gphi_j*(bj-aj)>=0) {
   bj=aj;
  } /* else bj unchanged */
  aj=alphaj;
 }
 ci++;
 }

 if (!found_step) {
  /* use bound to find possible step: fall back to the last trial value */
  alphak=alphaj;
 }

#ifdef DEBUG
 printf("Found %g Interval [%lf,%lf]\n",alphak,a,b);
#endif
 return alphak;
}
/* line search (Fletcher bracketing phase): find a step alphak along pk that
   satisfies the Wolfe conditions for phi(alpha)=||func(xk+alpha pk)-xo||^2,
   bracketing first and delegating refinement to linesearch_zoom() */
/* func: vector function
 xk: parameter values size m x 1 (at which step is calculated)
 pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
 alpha1: initial value for step
 sigma,rho,t1,t2,t3: line search parameters (from Fletcher)
 xo: observed data size n x 1
 n: size of vector function
 step: step size for differencing
 adata: additional data passed to the function
*/
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static double
linesearch(
 void (*func)(double *p, double *hx, int m, int n, void *adata),
 double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {
 /* phi(alpha)=f(xk+alpha pk)
 for vector function func
 f(xk) =||func(xk)||^2 */

 double *x,*xp;
 double alphai,alphai1;
 double phi_0,phi_alphai,phi_alphai1;
 double p01,p02;
 double gphi_0,gphi_i;
 double alphak;

 double mu;
 double tol; /* lower limit for minimization */

 int ci;

 if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 if ((xp=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }

 alphak=1.0;
 /* evaluate phi_0 and grad(phi_0) */
 func(xk,x,m,n,adata);
 my_daxpy(n,xo,-1.0,x);
 phi_0=my_dnrm2(n,x);
 phi_0*=phi_0;
 /* select tolarance 1/100 of current function value */
 tol=MIN(0.01*phi_0,1e-6);

 /* grad(phi_0): evaluate at -step and +step (central difference) */
 my_dcopy(m,xk,1,xp,1); /* xp<=xk */
 my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */
 func(xp,x,m,n,adata);
 /* calculate x<=x-xo */
 my_daxpy(n,xo,-1.0,x);
 p01=my_dnrm2(n,x);
 my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */
 func(xp,x,m,n,adata);
 /* calculate x<=x-xo */
 my_daxpy(n,xo,-1.0,x);
 p02=my_dnrm2(n,x);
 gphi_0=(p01*p01-p02*p02)/(2.0*step);

 /* estimate for mu */
 /* mu = (tol-phi_0)/(rho gphi_0)
    NOTE(review): assumes pk is a descent direction so gphi_0<0; a zero
    gradient here would divide by zero — confirm callers guarantee this */
 mu=(tol-phi_0)/(rho*gphi_0);
#ifdef DEBUG
 printf("mu=%lf, alpha1=%lf\n",mu,alpha1);
#endif

 ci=1;
 alphai=alpha1; /* initial value for alpha(i) : check if 0<alphai<=mu */
 alphai1=0.0;
 phi_alphai1=phi_0;
 while(ci<10) {
 /* evalualte phi(alpha(i))=f(xk+alphai pk) */
 my_dcopy(m,xk,1,xp,1); /* xp<=xk */
 my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */
 func(xp,x,m,n,adata);
 /* calculate x<=x-xo */
 my_daxpy(n,xo,-1.0,x);
 phi_alphai=my_dnrm2(n,x);
 phi_alphai*=phi_alphai;

 if (phi_alphai<tol) {
  alphak=alphai;
#ifdef DEBUG
  printf("Linesearch : Condition 0 met\n");
#endif
  break;
 }

 if ((phi_alphai>phi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) {
  /* ai=alphai1, bi=alphai bracket */
  alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
#ifdef DEBUG
  printf("Linesearch : Condition 1 met\n");
#endif
  break;
 }

 /* evaluate grad(phi(alpha(i))) by central difference */
 my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here?? xp<=xk */
 my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */
 func(xp,x,m,n,adata);
 /* calculate x<=x-xo */
 my_daxpy(n,xo,-1.0,x);
 p01=my_dnrm2(n,x);
 my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */
 func(xp,x,m,n,adata);
 /* calculate x<=x-xo */
 my_daxpy(n,xo,-1.0,x);
 p02=my_dnrm2(n,x);
 gphi_i=(p01*p01-p02*p02)/(2.0*step);

 if (fabs(gphi_i)<=-sigma*gphi_0) {
  alphak=alphai;
#ifdef DEBUG
  printf("Linesearch : Condition 2 met\n");
#endif
  break;
 }

 if (gphi_i>=0) {
  /* ai=alphai, bi=alphai1 bracket */
  alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
#ifdef DEBUG
  printf("Linesearch : Condition 3 met\n");
#endif
  break;
 }

 /* else preserve old values */
 if (mu<=(2.0*alphai-alphai1)) {
  /* next step */
  alphai1=alphai;
  alphai=mu;
 } else {
  /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */
  p01=2.0*alphai-alphai1;
  p02=MIN(mu,alphai+t1*(alphai-alphai1));
  alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
  //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai);
 }
 phi_alphai1=phi_alphai;

 ci++;
 }

 free(x);
 free(xp);
#ifdef DEBUG
 printf("Step size=%g\n",alphak);
#endif
 return alphak;
}
/*************** END Fletcher line search **********************************/
/* limited-memory BFGS minimization of f(p)=||func(p)-x||^2.
   func: vector function hx=func(p)
   p: initial parameters m x 1, overwritten with the solution
   x: observed data n x 1
   m,n: parameter/data sizes
   itmax: max iterations
   M: lbfgs memory (no. of (s,y) pairs kept in the ring buffers)
   gpu_threads: not used in this CPU implementation (interface symmetry)
   adata: passed through to func and to the gradient workers
   returns 0 */
int
lbfgs_fit(
 void (*func)(double *p, double *hx, int m, int n, void *adata),
 double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {

 double *gk; /* gradients at both k+1 and k iter */
 double *xk1,*xk; /* parameters at k+1 and k iter */
 double *pk; /* step direction H_k * grad(f) */

 double step=1e-6; /* step for interpolation */
 double *y, *s; /* storage for delta(grad) and delta(p) */
 double *rho; /* storage for 1/yk^T*sk */
 int ci,ck,cm;
 double alphak=1.0;

 if ((gk=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 if ((xk1=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 if ((xk=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 if ((pk=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }

 /* storage size mM x 1*/
 if ((s=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 if ((y=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }
 if ((rho=(double*)calloc((size_t)M,sizeof(double)))==0) {
#ifndef USE_MIC
 fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
 exit(1);
 }

 /* initial value for params xk=p */
 my_dcopy(m,p,1,xk,1);
 /* gradient gk=grad(f)_k */
 func_grad(func,xk,gk,x,m,n,step,adata);
 double gradnrm=my_dnrm2(m,gk);
 /* if gradient is too small, no need to solve, so stop */
 if (gradnrm<CLM_STOP_THRESH) {
  ck=itmax;
  step=0.0;
 } else {
  ck=0;
  /* scale the differencing step to the gradient size, clamped to [1e-9,1e-6] */
  step=MAX(1e-9,MIN(1e-3/gradnrm,1e-6));
 }
#ifdef DEBUG
 printf("||grad||=%g step=%g\n",gradnrm,step);
#endif

 cm=0; /* offset (in doubles) of the current slot in the s,y ring buffers */
 ci=0; /* index of the current slot (0..M-1) */

 while (ck<itmax) {
 /* mult with hessian pk=-H_k*gk (use only the ck pairs stored so far) */
 if (ck<M) {
  mult_hessian(m,pk,gk,s,y,rho,ck,ci);
 } else {
  mult_hessian(m,pk,gk,s,y,rho,M,ci);
 }
 my_dscal(m,-1.0,pk);

 /* linesearch to find step length */
 /* parameters alpha1=10.0,sigma=0.1, rho=0.01, t1=9, t2=0.1, t3=0.5 */
 alphak=linesearch(func,xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata);
 /* parameters c1=1e-4 c2=0.9, alpha1=1.0, alphamax=10.0, step (for alpha)=1e-4*/
 //alphak=linesearch_nw(func,xk,pk,1.0,10.0,1e-4,0.9,x,m,n,1e-4,adata);
 //alphak=1.0;
 /* check if step size is too small, then stop */
 if (fabs(alphak)<CLM_EPSILON) {
  break;
 }
 /* update parameters xk1=xk+alpha_k *pk */
 my_dcopy(m,xk,1,xk1,1);
 my_daxpy(m,pk,alphak,xk1);

 /* calculate sk=xk1-xk and yk=gk1-gk */
 /* sk=xk1 */
 my_dcopy(m,xk1,1,&s[cm],1);
 /* sk=sk-xk */
 my_daxpy(m,xk,-1.0,&s[cm]);
 /* yk=-gk */
 my_dcopy(m,gk,1,&y[cm],1);
 my_dscal(m,-1.0,&y[cm]);

 /* update gradient */
 func_grad(func,xk1,gk,x,m,n,step,adata);
 /* yk=yk+gk1 */
 my_daxpy(m,gk,1.0,&y[cm]);

 /* calculate 1/yk^T*sk
    NOTE(review): no safeguard against y^T s == 0 (division by zero);
    standard L-BFGS skips the update when curvature is tiny — confirm */
 rho[ci]=1.0/my_ddot(m,&y[cm],&s[cm]);

 /* update xk=xk1 */
 my_dcopy(m,xk1,1,xk,1);

 //printf("iter %d store %d\n",ck,cm);
 ck++;
 /* increment storage appropriately: wrap the ring buffer after M slots */
 if (cm<(M-1)*m) {
  /* offset of m */
  cm=cm+m;
  ci++;
 } else {
  cm=ci=0;
 }
 }

 /* copy back solution to p */
 my_dcopy(m,xk,1,p,1);

 /* for (ci=0; ci<m; ci++) {
 printf("grad %d=%lf\n",ci,gk[ci]);
 } */

 free(gk);
 free(xk1);
 free(xk);
 free(pk);
 free(s);
 free(y);
 free(rho);
 return 0;
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,161 +0,0 @@
/*
*
Copyright (C) 2006-2015 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <pthread.h>
#include <math.h>
#include "Solvers.h"
#include <nvml.h>
//#define MPI_BUILD
#ifdef MPI_BUILD
#include <mpi.h>
#endif
//#define DEBUG
/* return random value in 0,1,..,maxval */
#ifndef MPI_BUILD
static int
random_pick(int maxval, taskhist *th) {
/* Return a uniformly distributed integer in [0,maxval].
   th: task history (unused here; kept for interface symmetry).
   Fix: random() may return RAND_MAX exactly, in which case rat==1.0
   and floor(rat*(maxval+1))==maxval+1, one past the valid range --
   that would select a nonexistent GPU. Clamp the result to maxval. */
double rat=(double)random()/(double)RAND_MAX;
double y=rat*(double)(maxval+1);
int x=(int)floor(y);
return (x>maxval? maxval : x);
}
#endif
void
init_task_hist(taskhist *th) {
/* Prepare the GPU-selection history for use: clear the bookkeeping
   fields and create the mutex that guards th->prev. */
th->rseed=0;
th->prev=-1; /* no previous pick yet */
pthread_mutex_init(&th->prev_mutex,NULL);
}
void
destroy_task_hist(taskhist *th) {
/* Tear down the selection history: destroy the mutex and reset the
   bookkeeping fields to their pristine values. */
pthread_mutex_destroy(&th->prev_mutex);
th->rseed=0;
th->prev=-1;
}
/* select a GPU from 0,1..,max_gpu
in such a way to allow load balancing */
int
select_work_gpu(int max_gpu, taskhist *th) {
/* Select a GPU id in [0,max_gpu] for the calling worker.
   MPI build: deterministic round-robin on the MPI rank (clamped to the
   number of devices actually present).
   Non-MPI build: query NVML and prefer the device with the most free
   memory, breaking ties by lowest utilization; when all devices look
   identical (or NVML is unavailable), pick randomly while avoiding the
   previous pick (th->prev, protected by th->prev_mutex) so consecutive
   callers spread over devices. */
#ifdef MPI_BUILD
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
/* check if max_gpu > no. of actual devices */
int actual_devcount;
cudaGetDeviceCount(&actual_devcount);
if (max_gpu+1>actual_devcount) {
return rank%(actual_devcount);
}
return rank%(max_gpu+1); /* modulo value */
#endif
#ifndef MPI_BUILD
/* sequentially query the devices to find
one with the min load/memory usage */
nvmlReturn_t result;
result = nvmlInit();
int retval;
int minid=-1;
int maxid=-1;
if (result!=NVML_SUCCESS) {
fprintf(stderr,"%s: %d: cannot access NVML\n",__FILE__,__LINE__);
/* return random pick */
retval=random_pick(max_gpu, th);
/* if this matches the previous value, select again */
pthread_mutex_lock(&th->prev_mutex);
while (retval==th->prev) {
retval=random_pick(max_gpu, th);
}
th->prev=retval;
pthread_mutex_unlock(&th->prev_mutex);
return retval;
} else {
/* iterate */
nvmlDevice_t device;
nvmlUtilization_t nvmlUtilization;
nvmlMemory_t nvmlMemory;
unsigned int min_util=101; /* GPU utilization */
unsigned int max_util=0; /* GPU utilization */
unsigned long long int max_free=0; /* max free memory */
unsigned long long int min_free=ULLONG_MAX; /* max free memory */
int ci;
/* NOTE(review): NVML return codes inside this loop are overwritten and
   never checked -- presumably benign on healthy systems; verify. */
for (ci=0; ci<=max_gpu; ci++) {
result=nvmlDeviceGetHandleByIndex(ci, &device);
result=nvmlDeviceGetUtilizationRates(device, &nvmlUtilization);
result=nvmlDeviceGetMemoryInfo(device, &nvmlMemory);
/* track device with lowest utilization */
if (min_util>nvmlUtilization.gpu) {
min_util=nvmlUtilization.gpu;
minid=ci;
}
if (max_util<nvmlUtilization.gpu) {
max_util=nvmlUtilization.gpu;
}
/* track device with most free memory */
if (max_free<nvmlMemory.free) {
max_free=nvmlMemory.free;
maxid=ci;
}
if (min_free>nvmlMemory.free) {
min_free=nvmlMemory.free;
}
}
result = nvmlShutdown();
/* give priority for selection a GPU with max free memory,
if there is a tie, use min utilization as second criterion */
/* if all have 0 usage, again use random */
if (max_free==min_free && max_util==min_util) {
retval=random_pick(max_gpu,th);
/* if this value matches previous one, select again */
pthread_mutex_lock(&th->prev_mutex);
while(retval==th->prev) {
retval=random_pick(max_gpu,th);
}
th->prev=retval;
pthread_mutex_unlock(&th->prev_mutex);
return retval;
} else {
if (max_free==min_free) { /* all cards have equal free mem */
retval=(int)minid;
} else {
retval=(int)maxid;
}
}
}
/* update last pick */
pthread_mutex_lock(&th->prev_mutex);
th->prev=retval;
pthread_mutex_unlock(&th->prev_mutex);
return retval;
#endif
}

Binary file not shown.

View File

@ -1,627 +0,0 @@
/*
*
Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "Solvers.h"
#include <math.h>
//#define DEBUG
typedef struct thread_data_manavg_ {
double *Y; /* solutions: Nf frequency blocks, each M directions of 8N doubles (2Nx2 complex, interleaved re/im) */
int startM; /* first direction (cluster) index handled by this thread */
int endM; /* last direction index handled by this thread (inclusive) */
int Niter; /* no. of averaging+projection iterations */
int N; /* no. of stations (one 2x2 Jones block per station) */
int M; /* total no. of directions in Y */
int Nf; /* no. of frequencies in Y */
} thread_data_manavg_t;
/* worker thread function for manifold average+projection */
static void*
manifold_average_threadfn(void *data) {
/* Worker: for each direction ci in [startM,endM], gather the Nf
   per-frequency 2Nx2 complex Jones blocks out of Y (stored as 8
   interleaved doubles per station), remove the per-frequency unitary
   ambiguity by Procrustes projection, iteratively average the blocks
   over frequency, and finally project each ORIGINAL block onto the
   average by a single unitary matrix before scattering back into Y. */
thread_data_manavg_t *t=(thread_data_manavg_t*)data;
int ci,cj,iter;
double *Yl;
complex double *J3,*Jp;
/* local storage 2Nx2 x Nf complex values */
if ((Yl=(double*)malloc((size_t)t->N*8*t->Nf*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((J3=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((Jp=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
#ifdef DEBUG
complex double *Jerr;
if ((Jerr=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
#endif
/* view the packed local buffer as complex 2Nx2 blocks */
complex double *Yc=(complex double*)Yl;
complex double a=1.0/(double)t->Nf+0.0*_Complex_I; /* averaging weight 1/Nf */
/* work for SVD: workspace-size query (lwork=-1) on a dummy 2x2 problem */
complex double *WORK=0;
complex double w[1];
double RWORK[32]; /* size > 5*max_matrix_dimension */
complex double JTJ[4],U[4],VT[4];
double S[2];
int status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
if (status!=0) {
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
exit(1);
}
int lwork=(int)w[0];
if ((WORK=(complex double*)malloc((size_t)(int)lwork*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
for (ci=t->startM; ci<=t->endM; ci++) {
/* copy to local storage: de-interleave the 8-double station layout
   into contiguous 2Nx2 complex blocks, one per frequency */
for (cj=0; cj<t->Nf; cj++) {
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8, &Yl[cj*8*t->N], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8, &Yl[cj*8*t->N+1], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8, &Yl[cj*8*t->N+2], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8, &Yl[cj*8*t->N+3], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8, &Yl[cj*8*t->N+4*t->N], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8, &Yl[cj*8*t->N+4*t->N+1], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8, &Yl[cj*8*t->N+4*t->N+2], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8, &Yl[cj*8*t->N+4*t->N+3], 4);
}
/* first averaging, select random block in [0,Nf-1] to project to */
int cr=rand()%(t->Nf); /* remainder always in [0,Nf-1] */
/* J3 <= cr th block */
my_ccopy(t->N*4,&Yc[cr*t->N*4],1,J3,1);
/* project the remainder */
for (cj=0; cj<cr; cj++) {
project_procrustes_block(t->N,J3,&Yc[cj*t->N*4]);
}
for (cj=cr+1; cj<t->Nf; cj++) {
project_procrustes_block(t->N,J3,&Yc[cj*t->N*4]);
}
/* now each 2, 2N complex vales is one J block */
/* average values and project to common average */
for (iter=0; iter<t->Niter; iter++) {
/* J3 <= 1st block */
my_ccopy(t->N*4,Yc,1,J3,1);
/* add the remainder */
for (cj=1; cj<t->Nf; cj++) {
my_caxpy(t->N*4,&Yc[cj*t->N*4],1.0+_Complex_I*0.0,J3);
}
my_cscal(t->N*4,a,J3);
/* now find unitary matrix using Procrustes problem */
for (cj=0; cj<t->Nf; cj++) {
/* find product JTJ = J^H J3 */
my_zgemm('C','N',2,2,2*t->N,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,J3,2*t->N,0.0+_Complex_I*0.0,JTJ,2);
status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
//printf("%d %d %lf %lf\n",ci,cj,S[0],S[1]);
/* find JTJ= U V^H */
my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
/* find J*(JTJ) : projected matrix */
my_zgemm('N','N',2*t->N,2,2,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,JTJ,2,0.0+_Complex_I*0.0,Jp,2*t->N);
/* copy back */
my_ccopy(t->N*4,Jp,1,&Yc[cj*t->N*4],1);
#ifdef DEBUG
/* calculate error between projected value and global mean */
my_ccopy(t->N*4,J3,1,Jerr,1);
my_caxpy(t->N*4,&Yc[cj*t->N*4],-1.0+_Complex_I*0.0,Jerr);
printf("Error freq=%d dir=%d iter=%d %lf\n",cj,ci,iter,my_cnrm2(t->N*4,Jerr));
#endif
}
}
/* now get a fresh copy, because we should modify Y only by
one unitary matrix */
my_ccopy(t->N*4,Yc,1,J3,1);
/* add the remainder */
for (cj=1; cj<t->Nf; cj++) {
my_caxpy(t->N*4,&Yc[cj*t->N*4],1.0+_Complex_I*0.0,J3);
}
my_cscal(t->N*4,a,J3);
/* reload the ORIGINAL (unprojected) blocks from Y before the final
   single-unitary projection onto the converged average J3 */
for (cj=0; cj<t->Nf; cj++) {
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8, &Yl[cj*8*t->N], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8, &Yl[cj*8*t->N+1], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8, &Yl[cj*8*t->N+2], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8, &Yl[cj*8*t->N+3], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8, &Yl[cj*8*t->N+4*t->N], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8, &Yl[cj*8*t->N+4*t->N+1], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8, &Yl[cj*8*t->N+4*t->N+2], 4);
my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8, &Yl[cj*8*t->N+4*t->N+3], 4);
}
for (cj=0; cj<t->Nf; cj++) {
/* find product JTJ = J^H J3 */
my_zgemm('C','N',2,2,2*t->N,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,J3,2*t->N,0.0+_Complex_I*0.0,JTJ,2);
status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
/* find JTJ= U V^H */
my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
/* find J*(JTJ) : projected matrix */
my_zgemm('N','N',2*t->N,2,2,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,JTJ,2,0.0+_Complex_I*0.0,Jp,2*t->N);
/* copy back */
my_ccopy(t->N*4,Jp,1,&Yc[cj*t->N*4],1);
}
/* copy back from local storage: re-interleave into Y's layout */
for (cj=0; cj<t->Nf; cj++) {
my_dcopy(t->N, &Yl[cj*8*t->N], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8);
my_dcopy(t->N, &Yl[cj*8*t->N+1], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8);
my_dcopy(t->N, &Yl[cj*8*t->N+2], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8);
my_dcopy(t->N, &Yl[cj*8*t->N+3], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8);
my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8);
my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+1], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8);
my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+2], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8);
my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+3], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8);
}
}
#ifdef DEBUG
free(Jerr);
#endif
free(Yl);
free(J3);
free(Jp);
free(WORK);
return NULL;
}
int
calculate_manifold_average(int N,int M,int Nf,double *Y,int Niter,int Nt) {
/* Average the per-frequency solutions in Y over frequency, per
   direction, using up to Nt joinable worker threads.
   Y : Nf frequency blocks, each holding M directions of 2Nx2
       solutions; each worker handles a contiguous range of the M
       directions via manifold_average_threadfn.
   Returns 0. */
pthread_attr_t attr;
pthread_t *wid;
thread_data_manavg_t *targ;
/* directions per thread: ceiling of M/Nt */
int chunk=(M+Nt-1)/Nt;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
if ((wid=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((targ=(thread_data_manavg_t*)malloc((size_t)Nt*sizeof(thread_data_manavg_t)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* carve [0,M) into per-thread ranges and launch the workers */
int offset=0;
int nspawned=0;
while (nspawned<Nt && offset<M) {
int thisblk=(offset+chunk<M? chunk : M-offset); /* last range may be short */
targ[nspawned].Y=Y;
targ[nspawned].N=N;
targ[nspawned].M=M;
targ[nspawned].Nf=Nf;
targ[nspawned].Niter=Niter;
targ[nspawned].startM=offset;
targ[nspawned].endM=offset+thisblk-1;
pthread_create(&wid[nspawned],&attr,manifold_average_threadfn,(void*)(&targ[nspawned]));
offset+=thisblk;
nspawned++;
}
/* wait for every launched worker */
for (int k=0; k<nspawned; k++) {
pthread_join(wid[k],NULL);
}
pthread_attr_destroy(&attr);
free(wid);
free(targ);
return 0;
}
int
project_procrustes(int N,double *J,double *J1) {
/* Orthogonal Procrustes: find unitary U minimizing ||J - J1 U|| and
   overwrite J1 with J1*U (the projection of J1 onto J's frame).
   J, J1 : 8Nx1 real arrays, interleaved re/im per 2x2 station block;
   they are repacked locally into 2Nx2 complex matrices.
   Returns 0; exits on allocation or LAPACK failure. */
complex double *X,*Y;
/* local storage */
if ((X=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((Y=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
double *Jx=(double*)X;
double *Jy=(double*)Y;
/* copy to get correct format: de-interleave 8-double blocks into the
   two columns of a 2Nx2 complex matrix */
my_dcopy(N, &J[0], 8, &Jx[0], 4);
my_dcopy(N, &J[0+1], 8, &Jx[1], 4);
my_dcopy(N, &J[0+4], 8, &Jx[2], 4);
my_dcopy(N, &J[0+5], 8, &Jx[3], 4);
my_dcopy(N, &J[0+2], 8, &Jx[4*N], 4);
my_dcopy(N, &J[0+3], 8, &Jx[4*N+1], 4);
my_dcopy(N, &J[0+6], 8, &Jx[4*N+2], 4);
my_dcopy(N, &J[0+7], 8, &Jx[4*N+3], 4);
my_dcopy(N, &J1[0], 8, &Jy[0], 4);
my_dcopy(N, &J1[0+1], 8, &Jy[1], 4);
my_dcopy(N, &J1[0+4], 8, &Jy[2], 4);
my_dcopy(N, &J1[0+5], 8, &Jy[3], 4);
my_dcopy(N, &J1[0+2], 8, &Jy[4*N], 4);
my_dcopy(N, &J1[0+3], 8, &Jy[4*N+1], 4);
my_dcopy(N, &J1[0+6], 8, &Jy[4*N+2], 4);
my_dcopy(N, &J1[0+7], 8, &Jy[4*N+3], 4);
/* min ||X - Y U|| find U */
/* work for SVD: workspace query first (lwork=-1) */
complex double *WORK=0;
complex double w[1];
double RWORK[32]; /* size > 5*max_matrix_dimension */
complex double JTJ[4],U[4],VT[4];
double S[2];
int status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
if (status!=0) {
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
exit(1);
}
int lwork=(int)w[0];
if ((WORK=(complex double*)malloc((size_t)(int)lwork*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* find product JTJ = Y^H X */
my_zgemm('C','N',2,2,2*N,1.0+_Complex_I*0.0,Y,2*N,X,2*N,0.0+_Complex_I*0.0,JTJ,2);
/* JTJ = U S V^H */
status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
/* find JTJ= U V^H : the optimal unitary (polar factor) */
my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
/* find Y*(JTJ) : projected matrix -> store in X */
my_zgemm('N','N',2*N,2,2,1.0+_Complex_I*0.0,Y,2*N,JTJ,2,0.0+_Complex_I*0.0,X,2*N);
/* re-interleave the projected matrix back into J1's 8-double layout */
my_dcopy(N, &Jx[0], 4, &J1[0], 8);
my_dcopy(N, &Jx[1], 4, &J1[0+1], 8);
my_dcopy(N, &Jx[2], 4, &J1[0+4], 8);
my_dcopy(N, &Jx[3], 4, &J1[0+5], 8);
my_dcopy(N, &Jx[4*N], 4, &J1[0+2], 8);
my_dcopy(N, &Jx[4*N+1], 4, &J1[0+3], 8);
my_dcopy(N, &Jx[4*N+2], 4, &J1[0+6], 8);
my_dcopy(N, &Jx[4*N+3], 4, &J1[0+7], 8);
free(WORK);
free(X);
free(Y);
return 0;
}
int
project_procrustes_block(int N,complex double *X,complex double *Y) {
/* Same Procrustes projection as project_procrustes(), but operating
   directly on 2Nx2 complex matrices (no repacking): find unitary U
   minimizing ||X - Y U|| and overwrite Y with Y*U.
   Returns 0; exits on allocation or LAPACK failure. */
complex double *Jlocal;
/* local storage */
if ((Jlocal=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* work for SVD: workspace query first (lwork=-1) */
complex double *WORK=0;
complex double w[1];
double RWORK[32]; /* size > 5*max_matrix_dimension */
complex double JTJ[4],U[4],VT[4];
double S[2];
int status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
if (status!=0) {
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
exit(1);
}
int lwork=(int)w[0];
if ((WORK=(complex double*)malloc((size_t)(int)lwork*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* find product JTJ = Y^H X */
my_zgemm('C','N',2,2,2*N,1.0+_Complex_I*0.0,Y,2*N,X,2*N,0.0+_Complex_I*0.0,JTJ,2);
/* JTJ = U S V^H */
status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
/* find JTJ= U V^H : the optimal unitary (polar factor) */
my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
/* find Y*(JTJ) : projected matrix -> store in Jlocal */
my_zgemm('N','N',2*N,2,2,1.0+_Complex_I*0.0,Y,2*N,JTJ,2,0.0+_Complex_I*0.0,Jlocal,2*N);
/* copy Jlocal -> Y */
my_dcopy(8*N, (double*)Jlocal, 1, (double*)Y, 1);
free(WORK);
free(Jlocal);
return 0;
}
//#define DEBUG
/* Extract only the phase of diagonal entries from solutions
p: 8Nx1 solutions, orders as [(real,imag)vec(J1),(real,imag)vec(J2),...]
pout: 8Nx1 phases (exp(j*phase)) of solutions, after joint diagonalization of p
N: no. of 2x2 Jones matrices in p, having common unitary ambiguity
niter: no of iterations for Jacobi rotation */
int
extract_phases(double *p, double *pout, int N, int niter) {
/* Jointly diagonalize the N 2x2 Jones matrices in p (which share one
   unitary ambiguity) by iterative complex Jacobi rotations, then keep
   only the phase (unit-modulus value) of the diagonal entries.
   p    : 8Nx1 input, ordered [(re,im)vec(J1),(re,im)vec(J2),...]
   pout : 8Nx1 output; off-diagonal entries zeroed, diagonals replaced
          by exp(j*phase)
   N    : no. of 2x2 matrices
   niter: no. of Jacobi sweeps (each sweep rotates element (1,2) then (2,1))
   Returns 0; exits on allocation or LAPACK failure. */
/* local storage */
complex double *J,*Jcopy;
/* local storage, change ordering of solutions [J_1^T,J_2^T,...]^T */
if ((J=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((Jcopy=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
double *Jx=(double *)J;
/* copy to get correct format: de-interleave into a 2Nx2 complex matrix */
my_dcopy(N, &p[0], 8, &Jx[0], 4);
my_dcopy(N, &p[0+1], 8, &Jx[1], 4);
my_dcopy(N, &p[0+4], 8, &Jx[2], 4);
my_dcopy(N, &p[0+5], 8, &Jx[3], 4);
my_dcopy(N, &p[0+2], 8, &Jx[4*N], 4);
my_dcopy(N, &p[0+3], 8, &Jx[4*N+1], 4);
my_dcopy(N, &p[0+6], 8, &Jx[4*N+2], 4);
my_dcopy(N, &p[0+7], 8, &Jx[4*N+3], 4);
complex double h[3],Hc[9];
double H[9];
double W[3],Z[3];
double w[1],*WORK;
int IWORK[15],IFAIL[3],info;
int ni,ci;
complex double c,s,G[4];
#ifdef DEBUG
printf("J=[\n");
for (ci=0; ci<N; ci++) {
printf("%lf+j*(%lf), %lf+j*(%lf)\n",p[8*ci],p[8*ci+1],p[8*ci+2],p[8*ci+3]);
printf("%lf+j*(%lf), %lf+j*(%lf)\n",p[8*ci+4],p[8*ci+5],p[8*ci+6],p[8*ci+7]);
}
printf("];\n");
#endif
/* setup workspace for eigenvalue decomposition (lwork query) */
info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,w,-1,IWORK,IFAIL);
if (info) {
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
exit(1);
}
/* get work size */
int lwork=(int)w[0];
/* allocate memory */
if ((WORK=(double*)malloc((size_t)lwork*sizeof(double)))==0) {
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
exit(1);
}
/* iteration loop: one Jacobi sweep per ni */
for (ni=0; ni<niter; ni++) {
/************** for element (1,2) **********************/
/* accumulate h*h^H product */
memset(Hc,0,9*sizeof(complex double));
for (ci=0; ci<N; ci++) {
/* [a_ii-a_jj,a_ij+a_ji,I*(a_ji-a_ij)] */
h[0]=conj(J[2*ci]-J[2*ci+2*N+1]);
h[1]=conj(J[2*ci+2*N]+J[2*ci+1]);
h[2]=conj(_Complex_I*(J[2*ci+1]-J[2*ci+2*N]));
/* store results onto lower triangle */
my_zher('L',3,1.0,h,1,Hc,3);
}
/* get real part, copy it to lower triangle */
H[0]=creal(Hc[0]);
H[1]=creal(Hc[1]);
H[2]=creal(Hc[2]);
H[4]=creal(Hc[4]);
H[5]=creal(Hc[5]);
H[8]=creal(Hc[8]);
#ifdef DEBUG
printf("H=[\n");
printf("%e %e %e\n",H[0],H[1],H[2]);
printf("%e %e %e\n",H[1],H[4],H[5]);
printf("%e %e %e\n",H[2],H[5],H[8]);
printf("];\n");
#endif
/* eigenvector for the largest eigenvalue (index 3 of 3) of H */
info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,WORK,lwork,IWORK,IFAIL);
if (info<0) {
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
exit(1);
}
#ifdef DEBUG
printf("max eigenvalue=%e\n",W[0]);
printf("ev=[\n");
printf("%e\n",Z[0]);
printf("%e\n",Z[1]);
printf("%e\n",Z[2]);
printf("];\n");
#endif
/* form sin,cos values from the eigenvector */
if (Z[0]>=0.0) {
c=sqrt(0.5+Z[0]*0.5)+_Complex_I*0.0;
s=0.5*(Z[1]-_Complex_I*Z[2])/c;
} else {
/* flip sign of eigenvector */
c=sqrt(0.5-Z[0]*0.5)+_Complex_I*0.0;
s=0.5*(-Z[1]+_Complex_I*Z[2])/c;
}
/* form Givens rotation matrix */
G[0]=c;
G[1]=-s;
G[2]=conj(s);
G[3]=conj(c);
#ifdef DEBUG
printf("G=[\n");
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[0]),cimag(G[0]),creal(G[2]),cimag(G[2]));
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[1]),cimag(G[1]),creal(G[3]),cimag(G[3]));
printf("];\n");
#endif
/* rotate J <= J * G^H: Jcopy = 1 x J x G^H + 0 x Jcopy */
my_zgemm('N','C',2*N,2,2,1.0+_Complex_I*0.0,J,2*N,G,2,0.0+_Complex_I*0.0,Jcopy,2*N);
memcpy(J,Jcopy,(size_t)4*N*sizeof(complex double));
#ifdef DEBUG
printf("JGH=[\n");
for (ci=0; ci<N; ci++) {
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
}
printf("];\n");
#endif
/************** for element (2,1) **********************/
/* accumulate h*h^H product (indices swapped w.r.t. (1,2) case) */
memset(Hc,0,9*sizeof(complex double));
for (ci=0; ci<N; ci++) {
/* [a_ii-a_jj,a_ij+a_ji,I*(a_ji-a_ij)] */
h[0]=conj(J[2*ci+2*N+1]-J[2*ci]);
h[1]=conj(J[2*ci+1]+J[2*ci+2*N]);
h[2]=conj(_Complex_I*(J[2*ci+2*N]-J[2*ci+1]));
/* store results onto lower triangle */
my_zher('L',3,1.0,h,1,Hc,3);
}
/* get real part, copy it to lower triangle */
H[0]=creal(Hc[0]);
H[1]=creal(Hc[1]);
H[2]=creal(Hc[2]);
H[4]=creal(Hc[4]);
H[5]=creal(Hc[5]);
H[8]=creal(Hc[8]);
#ifdef DEBUG
printf("H=[\n");
printf("%e %e %e\n",H[0],H[1],H[2]);
printf("%e %e %e\n",H[1],H[4],H[5]);
printf("%e %e %e\n",H[2],H[5],H[8]);
printf("];\n");
#endif
info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,WORK,lwork,IWORK,IFAIL);
if (info<0) {
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
exit(1);
}
#ifdef DEBUG
printf("max eigenvalue=%e\n",W[0]);
printf("ev=[\n");
printf("%e\n",Z[0]);
printf("%e\n",Z[1]);
printf("%e\n",Z[2]);
printf("];\n");
#endif
/* form sin,cos values */
if (Z[0]>=0.0) {
c=sqrt(0.5+Z[0]*0.5)+_Complex_I*0.0;
s=0.5*(Z[1]-_Complex_I*Z[2])/c;
} else {
/* flip sign of eigenvector */
c=sqrt(0.5-Z[0]*0.5)+_Complex_I*0.0;
s=0.5*(-Z[1]+_Complex_I*Z[2])/c;
}
/* form Givens rotation matrix */
G[0]=c;
G[1]=-s;
G[2]=conj(s);
G[3]=conj(c);
#ifdef DEBUG
printf("G=[\n");
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[0]),cimag(G[0]),creal(G[2]),cimag(G[2]));
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[1]),cimag(G[1]),creal(G[3]),cimag(G[3]));
printf("];\n");
#endif
/* rotate J <= J * G^H: Jcopy = 1 x J x G^H + 0 x Jcopy */
my_zgemm('N','C',2*N,2,2,1.0+_Complex_I*0.0,J,2*N,G,2,0.0+_Complex_I*0.0,Jcopy,2*N);
/* before copying updated result, find residual norm */
/* J = -Jcopy + J */
my_caxpy(4*N,Jcopy,-1.0+_Complex_I*0.0,J);
#ifdef DEBUG
printf("Iter %d residual=%lf\n",ni,my_cnrm2(4*N,J));
#endif
memcpy(J,Jcopy,(size_t)4*N*sizeof(complex double));
#ifdef DEBUG
printf("JGH=[\n");
for (ci=0; ci<N; ci++) {
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
}
printf("];\n");
#endif
}
free(WORK);
#ifdef DEBUG
printf("Jfinal=[\n");
for (ci=0; ci<N; ci++) {
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
}
printf("];\n");
#endif
/* extract phase only from diagonal elements (normalize to unit modulus) */
for (ci=0; ci<N; ci++) {
J[2*ci]=J[2*ci]/cabs(J[2*ci]);
J[2*ci+2*N+1]=J[2*ci+2*N+1]/cabs(J[2*ci+2*N+1]);
}
/* copy back to output (only the diagonal values) */
memset(pout,0,sizeof(double)*8*N);
my_dcopy(N, &Jx[0], 4, &pout[0], 8);
my_dcopy(N, &Jx[1], 4, &pout[0+1], 8);
my_dcopy(N, &Jx[4*N+2], 4, &pout[0+6], 8);
my_dcopy(N, &Jx[4*N+3], 4, &pout[0+7], 8);
free(J);
free(Jcopy);
return 0;
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,380 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "cuda.h"
#include <cuComplex.h>
#include <stdio.h>
/* enable this for checking for kernel failure */
//#define CUDA_DBG
__global__ void kernel_diagdiv_fl(int M, float eps, float *y, float *x){
/* Elementwise y[i] <- y[i]/x[i] when x[i] exceeds eps, else 0.
   One thread per element; threads with index >= M do nothing. */
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx<M) {
 y[idx] = (x[idx]>eps ? y[idx]/x[idx] : 0.0f);
}
}
__global__ void kernel_diagmu_fl(int M, float *A,float mu){
/* Add mu to every diagonal entry of the MxM matrix A; diagonal
   entries sit at stride M+1. One thread per diagonal element. */
unsigned int d = blockIdx.x*blockDim.x + threadIdx.x;
if (d<M) {
 A[d*(M+1)] += mu;
}
}
__global__ void kernel_func_fl(int Nbase, float *x, float *coh, float *p, short *bb, int N){
/* Model evaluation, one thread per baseline:
   x(n) = G1 * C(n) * G2^H where G1,G2 are the 2x2 complex Jones
   matrices of the two stations of baseline n and C(n) is the
   coherency. Flagged baselines (negative station ids) are skipped. */
/* global thread index : equal to the baseline */
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
/* this thread works on
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
bb[2*n:2*n+1] (sta1,sta2)
organization of p (N stations and M clusters)
sta 0 sta 1 sta 2 .... sta N-1
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
......
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
organization of coherencies (coh)
[0, 8*M-1] : baseline 0
[8*M, 8*M+8*M-1]: baseline 1
[n*8*M, n*8*M+8*M-1]: baseline n
......
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
residual error stored at sum[n]
*/
if(n<Nbase) {
int sta1=(int)bb[2*n];
int sta2=(int)bb[2*n+1];
/* condition for calculating this baseline sum is
1) its not flagged (sta1,sta2)>=0
*/
if (sta1>=0 && sta2>=0) {
/* G1 = Jones matrix of station 1 (8 reals -> 2x2 complex) */
cuFloatComplex G1[4];
float pp[8];
pp[0]=p[sta1*8];
pp[1]=p[sta1*8+1];
pp[2]=p[sta1*8+2];
pp[3]=p[sta1*8+3];
pp[4]=p[sta1*8+4];
pp[5]=p[sta1*8+5];
pp[6]=p[sta1*8+6];
pp[7]=p[sta1*8+7];
G1[0].x=pp[0];
G1[0].y=pp[1];
G1[1].x=pp[2];
G1[1].y=pp[3];
G1[2].x=pp[4];
G1[2].y=pp[5];
G1[3].x=pp[6];
G1[3].y=pp[7];
/* C = coherency of this baseline */
cuFloatComplex C[4];
C[0].x=coh[8*n];
C[0].y=coh[8*n+1];
C[1].x=coh[8*n+2];
C[1].y=coh[8*n+3];
C[2].x=coh[8*n+4];
C[2].y=coh[8*n+5];
C[3].x=coh[8*n+6];
C[3].y=coh[8*n+7];
cuFloatComplex T1[4];
/* T=G1*C */
T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
/* G2 = conj-transpose of station-2 Jones: negate imag parts and
   swap off-diagonal entries while loading */
cuFloatComplex G2[4];
/* conjugate this */
pp[0]=p[sta2*8];
pp[1]=-p[sta2*8+1];
pp[2]=p[sta2*8+2];
pp[3]=-p[sta2*8+3];
pp[4]=p[sta2*8+4];
pp[5]=-p[sta2*8+5];
pp[6]=p[sta2*8+6];
pp[7]=-p[sta2*8+7];
G2[0].x=pp[0];
G2[0].y=pp[1];
G2[2].x=pp[2];
G2[2].y=pp[3];
G2[1].x=pp[4];
G2[1].y=pp[5];
G2[3].x=pp[6];
G2[3].y=pp[7];
cuFloatComplex T2[4];
T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
/* update model vector */
x[8*n]=T2[0].x;
x[8*n+1]=T2[0].y;
x[8*n+2]=T2[1].x;
x[8*n+3]=T2[1].y;
x[8*n+4]=T2[2].x;
x[8*n+5]=T2[2].y;
x[8*n+6]=T2[3].x;
x[8*n+7]=T2[3].y;
}
}
}
__global__ void kernel_jacf_fl(int Nbase, int M, float *jac, float *coh, float *p, short *bb, int N){
/* Jacobian evaluation: thread (n,m) computes d x(n) / d p(m), the
   derivative of baseline n's model w.r.t. parameter m. Nonzero only
   when parameter m belongs to one of the baseline's two stations;
   then the owning station's Jones matrix is replaced by the
   elementary matrix with 1 at the parameter's slot. */
/* global thread index : equal to the baseline */
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
/* which parameter:0...M */
unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
/* this thread works on
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
bb[2*n:2*n+1] (sta1,sta2)
organization of p (N stations and M clusters)
sta 0 sta 1 sta 2 .... sta N-1
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
......
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
organization of coherencies (coh)
[0, 8*M-1] : baseline 0
[8*M, 8*M+8*M-1]: baseline 1
[n*8*M, n*8*M+8*M-1]: baseline n
......
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
residual error stored at sum[n]
*/
if(n<Nbase && m<M) {
int sta1=(int)bb[2*n];
int sta2=(int)bb[2*n+1];
/* condition for calculating this baseline sum is
If this baseline is flagged,
or if this parameter does not belong to sta1 or sta2
we do not compute
*/
//int stc=m/8; /* 0...Ns-1 (because M=total par= 8 * Nstations */
int stc=m>>3; /* 0...Ns-1 (because M=total par= 8 * Nstations */
if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {
cuFloatComplex C[4];
C[0].x=coh[8*n];
C[0].y=coh[8*n+1];
C[1].x=coh[8*n+2];
C[1].y=coh[8*n+3];
C[2].x=coh[8*n+4];
C[2].y=coh[8*n+5];
C[3].x=coh[8*n+6];
C[3].y=coh[8*n+7];
/* which parameter exactly 0..7 */
//int stoff=m%8;
int stoff=m-stc*8;
/* pp1/pp2: Jones parameter vectors; the station owning parameter m
   gets the unit vector e_stoff (derivative of a linear term), the
   other station keeps its actual parameters. */
float pp1[8];
float pp2[8];
if (stc==sta1) {
for (int cn=0; cn<8; cn++) {
pp1[cn]=0.0f;
pp2[cn]=p[sta2*8+cn];
}
pp1[stoff]=1.0f;
} else if (stc==sta2) {
for (int cn=0; cn<8; cn++) {
pp2[cn]=0.0f;
pp1[cn]=p[sta1*8+cn];
}
pp2[stoff]=1.0f;
}
cuFloatComplex G1[4];
G1[0].x=pp1[0];
G1[0].y=pp1[1];
G1[1].x=pp1[2];
G1[1].y=pp1[3];
G1[2].x=pp1[4];
G1[2].y=pp1[5];
G1[3].x=pp1[6];
G1[3].y=pp1[7];
cuFloatComplex T1[4];
/* T=G1*C */
T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
/* G2 = conj-transpose of pp2 (loaded with swapped off-diagonals) */
cuFloatComplex G2[4];
/* conjugate this */
G2[0].x=pp2[0];
G2[0].y=-pp2[1];
G2[2].x=pp2[2];
G2[2].y=-pp2[3];
G2[1].x=pp2[4];
G2[1].y=-pp2[5];
G2[3].x=pp2[6];
G2[3].y=-pp2[7];
cuFloatComplex T2[4];
T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
/* update jacobian */
/* NOTE: row major order */
jac[m+M*8*n]=T2[0].x;
jac[m+M*(8*n+1)]=T2[0].y;
jac[m+M*(8*n+2)]=T2[1].x;
jac[m+M*(8*n+3)]=T2[1].y;
jac[m+M*(8*n+4)]=T2[2].x;
jac[m+M*(8*n+5)]=T2[2].y;
jac[m+M*(8*n+6)]=T2[3].x;
jac[m+M*(8*n+7)]=T2[3].y;
}
}
}
/* only use extern if calling code is C */
extern "C"
{
/* divide by singular values Dpd[]/Sd[] for Sd[]> eps */
void
cudakernel_diagdiv_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float eps, float *Dpd, float *Sd) {
/* Host wrapper: Dpd[i] <- Dpd[i]/Sd[i] for Sd[i]>eps, else 0
   (elementwise over M entries), then synchronize. Device pointers. */
#ifdef CUDA_DBG
cudaError_t error;
#endif
kernel_diagdiv_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, eps, Dpd, Sd);
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
if(error != cudaSuccess)
{
// print the CUDA error message and exit
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
exit(-1);
}
#endif
}
/* cuda driver for calculating
A<= A+mu I, adding mu to diagonal entries of A
A: size MxM
ThreadsPerBlock, BlocksPerGrid calculated to meet M
*/
void
cudakernel_diagmu_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float *A, float mu) {
/* Host wrapper: A <- A + mu*I for the MxM device matrix A
   (Levenberg-Marquardt damping), then synchronize. */
#ifdef CUDA_DBG
cudaError_t error;
#endif
kernel_diagmu_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, A, mu);
cudaDeviceSynchronize();
#ifdef CUDA_DBG
error = cudaGetLastError();
if(error != cudaSuccess)
{
// print the CUDA error message and exit
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
exit(-1);
}
#endif
}
/* cuda driver for calculating f(): evaluates the model visibilities into x.
   p: params (Mx1), x: data (Nx1, zeroed here before the kernel accumulates into it),
   other data: coh (coherencies), bbh (baseline->station map), Nbase, Mclus, Nstations.
   All pointers are device pointers. Blocks until the kernel finishes. */
void
cudakernel_func_fl(int ThreadsPerBlock, int BlocksPerGrid, float *p, float *x, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
#ifdef CUDA_DBG
  cudaError_t error;
#endif
  /* x must start at zero: the kernel writes per-baseline contributions */
  cudaMemset(x, 0, N*sizeof(float));
// printf("Kernel data size=%d, block=%d, thread=%d, baselines=%d\n",N,BlocksPerGrid, ThreadsPerBlock,Nbase);
  kernel_func_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase, x, coh, p, bbh, Nstations);
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif
}
/* cuda driver for calculating jacf(): fills the Jacobian (zeroed first).
   p: params (Mx1), jac: jacobian (NxM, row major), other data: coh,
   baseline->station mapping bbh, Nbase, Mclus, Nstations. All device pointers.
   NOTE(review): the ThreadsPerBlock_row/ThreadsPerBlock_col parameters are
   currently ignored — the launch uses a hardcoded 16x8 block; confirm whether
   callers rely on their values before changing this. */
void
cudakernel_jacf_fl(int ThreadsPerBlock_row, int ThreadsPerBlock_col, float *p, float *jac, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
#ifdef CUDA_DBG
  cudaError_t error;
#endif
  /* NOTE: use small value for ThreadsPerBlock here, like 8 */
  dim3 threadsPerBlock(16, 8);
  /* jacobian: Nbase x Nstations (proportional to N), so grid covers baselines x params */
  dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x,
               (M+threadsPerBlock.y-1)/threadsPerBlock.y);
  /* set memory of jac to zero */
  cudaMemset(jac, 0, N*M*sizeof(float));
// printf("Kernel Jax data size=%d, params=%d, block=%d,%d, thread=%d,%d, baselines=%d\n",N, M, numBlocks.x,numBlocks.y, threadsPerBlock.x, threadsPerBlock.y, Nbase);
  kernel_jacf_fl<<< numBlocks, threadsPerBlock>>>(Nbase, M, jac, coh, p, bbh, Nstations);
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif
}
}

Binary file not shown.

View File

@ -1,462 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "Solvers.h"
#include <string.h> /* for memcpy */
/* machine precision: query LAPACK machine parameter CMACH (e.g. 'E' = relative
   epsilon) via the Fortran routine dlamch_. CMACH is taken by value so its
   address can be passed to Fortran. */
double
dlamch(char CMACH) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern double dlamch_(char *CMACH);
  return(dlamch_(&CMACH));
}
/* BLAS dcopy wrapper: y <- x.
   x is read with stride Nx (so x must hold at least N*Nx doubles) and
   y is written with stride Ny (so y must hold at least N*Ny doubles). */
void
my_dcopy(int N, double *x, int Nx, double *y, int Ny) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dcopy_(int *N, double *x, int *incx, double *y, int *incy);
  if (Nx!=1 || Ny!=1) {
    dcopy_(&N,x,&Nx,y,&Ny);
  } else {
    /* both strides are 1: a plain memcpy is equivalent and cheaper */
    memcpy((void*)y,(void*)x,sizeof(double)*(size_t)N);
  }
}
/* blas dscal: in-place scale, x <- a*x (unit stride, length N). */
void
my_dscal(int N, double a, double *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dscal_(int *N, double *alpha, double *x, int *incx);
  int i=1; /* unit stride */
  dscal_(&N,&a,x,&i);
}
/* blas sscal: single-precision in-place scale, x <- a*x (unit stride). */
void
my_sscal(int N, float a, float *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void sscal_(int *N, float *alpha, float *x, int *incx);
  int i=1; /* unit stride */
  sscal_(&N,&a,x,&i);
}
/* blas ddot: inner product x^T*y of two double vectors (unit stride). */
double
my_ddot(int N, double *x, double *y) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern double ddot_(int *N, double *x, int *incx, double *y, int *incy);
  int i=1; /* unit stride for both vectors */
  return(ddot_(&N,x,&i,y,&i));
}
/* blas dnrm2: Euclidean norm ||x||_2 of a double vector (unit stride). */
double
my_dnrm2(int N, double *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern double dnrm2_(int *N, double *x, int *incx);
  int i=1; /* unit stride */
  return(dnrm2_(&N,x,&i));
}
/* blas snrm2: Euclidean norm ||x||_2 of a float vector (unit stride).
   NOTE(review): snrm2_ returning float assumes an f2c/gfortran-compatible
   BLAS; some builds return double — verify against the linked library. */
float
my_fnrm2(int N, float *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern float snrm2_(int *N, float *x, int *incx);
  int i=1; /* unit stride */
  return(snrm2_(&N,x,&i));
}
/* blas dasum: 1-norm sum|x_i| of a double vector (unit stride). */
double
my_dasum(int N, double *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern double dasum_(int *N, double *x, int *incx);
  int i=1; /* unit stride */
  return(dasum_(&N,x,&i));
}
/* blas sasum: 1-norm sum|x_i| of a float vector (unit stride).
   NOTE(review): float return from sasum_ depends on the Fortran ABI — verify. */
float
my_fasum(int N, float *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern float sasum_(int *N, float *x, int *incx);
  int i=1; /* unit stride */
  return(sasum_(&N,x,&i));
}
/* BLAS daxpy: y <- a*x + y (double, unit stride). */
void
my_daxpy(int N, double *x, double a, double *y) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void daxpy_(int *N, double *alpha, double *x, int *incx, double *y, int *incy);
  int i=1; /* strides */
  daxpy_(&N,&a,x,&i,y,&i);
}
/* BLAS daxpy with explicit strides: y <- a*x + y, reading x every incx
   elements and writing y every incy elements. */
void
my_daxpys(int N, double *x, int incx, double a, double *y, int incy) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void daxpy_(int *N, double *alpha, double *x, int *incx, double *y, int *incy);
  daxpy_(&N,&a,x,&incx,y,&incy);
}
/* BLAS saxpy: y <- a*x + y (single precision, unit stride). */
void
my_saxpy(int N, float *x, float a, float *y) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void saxpy_(int *N, float *alpha, float *x, int *incx, float *y, int *incy);
  int i=1; /* strides */
  saxpy_(&N,&a,x,&i,y,&i);
}
/* blas idamax: index of max |x_i|, using Fortran 1-based indexing. */
int
my_idamax(int N, double *x, int incx) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern int idamax_(int *N, double *x, int *incx);
  return idamax_(&N,x,&incx);
}
/* blas isamax: index of max |x_i| for a float vector (1-based index). */
int
my_isamax(int N, float *x, int incx) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern int isamax_(int *N, float *x, int *incx);
  return isamax_(&N,x,&incx);
}
/* idamin: index of min |x_i| (1-based index).
   NOTE(review): idamin_ is not part of reference BLAS — it is an extension
   (e.g. provided by OpenBLAS); confirm the linked library exports it. */
int
my_idamin(int N, double *x, int incx) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern int idamin_(int *N, double *x, int *incx);
  return idamin_(&N,x,&incx);
}
/* BLAS DGEMM: C = alpha*op(A)*op(B) + beta*C.
   transa/transb: 'N' or 'T'. M,N,K: dimensions of op(A) (MxK) and op(B) (KxN).
   lda/ldb/ldc: leading dimensions (column-major, Fortran convention). */
void
my_dgemm(char transa, char transb, int M, int N, int K, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double *ALPHA, double *A, int *LDA, double *B, int * LDB, double *BETA, double *C, int *LDC);
  dgemm_(&transa, &transb, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
}
/* BLAS DGEMV: y = alpha*op(A)*x + beta*y, where op is 'N' or 'T'.
   A is MxN with leading dimension lda (column-major). */
void
my_dgemv(char trans, int M, int N, double alpha, double *A, int lda, double *x, int incx, double beta, double *y, int incy) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgemv_(char *TRANS, int *M, int *N, double *ALPHA, double *A, int *LDA, double *X, int *INCX, double *BETA, double *Y, int *INCY);
  dgemv_(&trans, &M, &N, &alpha, A, &lda, x, &incx, &beta, y, &incy);
}
/* following routines used in LAPACK solvers */
/* Cholesky factorization of a real symmetric positive-definite matrix.
   uplo: 'U' or 'L' selects which triangle of A is stored/used.
   Returns the LAPACK info code (0 = success, >0 = not positive definite). */
int
my_dpotrf(char uplo, int N, double *A, int lda) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dpotrf_(char *uplo, int *N, double *A, int *lda, int *info);
  int info;
  dpotrf_(&uplo,&N,A,&lda,&info);
  return info;
}
/* solve Ax=b using a Cholesky factorization previously computed by my_dpotrf.
   b (N x nrhs) is overwritten with the solution. Returns LAPACK info (0 = ok). */
int
my_dpotrs(char uplo, int N, int nrhs, double *A, int lda, double *b, int ldb){
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dpotrs_(char *uplo, int *N, int *nrhs, double *A, int *lda, double *b, int *ldb, int *info);
  int info;
  dpotrs_(&uplo,&N,&nrhs,A,&lda,b,&ldb,&info);
  return info;
}
/* least-squares solve of Ax=b via QR/LQ factorization (LAPACK dgels).
   B is overwritten with the solution; WORK/LWORK is caller-supplied workspace.
   Returns LAPACK info (0 = ok). */
int
my_dgels(char TRANS, int M, int N, int NRHS, double *A, int LDA, double *B, int LDB, double *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgels_(char *TRANS, int *M, int *N, int *NRHS, double *A, int *LDA, double *B, int *LDB, double *WORK, int *LWORK, int *INFO);
  int info;
  dgels_(&TRANS,&M,&N,&NRHS,A,&LDA,B,&LDB,WORK,&LWORK,&info);
  return info;
}
/* SVD: A = U S VT (LAPACK dgesvd), so V needs NOT to be transposed afterwards.
   JOBU/JOBVT control how much of U/VT is computed ('A','S','O','N').
   A is destroyed. Returns LAPACK info (0 = ok). */
int
my_dgesvd(char JOBU, char JOBVT, int M, int N, double *A, int LDA, double *S,
   double *U, int LDU, double *VT, int LDVT, double *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgesvd_(char *JOBU, char *JOBVT, int *M, int *N, double *A,
    int *LDA, double *S, double *U, int *LDU, double *VT, int *LDVT,
    double *WORK, int *LWORK, int *info);
  int info;
  dgesvd_(&JOBU,&JOBVT,&M,&N,A,&LDA,S,U,&LDU,VT,&LDVT,WORK,&LWORK,&info);
  return info;
}
/* QR factorization A=QR (LAPACK dgeqrf). Q is stored implicitly via the
   elementary reflectors in A plus TAU; R is stored in the upper triangle of A.
   Returns LAPACK info (0 = ok). */
int
my_dgeqrf(int M, int N, double *A, int LDA, double *TAU, double *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgeqrf_(int *M, int *N, double *A, int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
  int info;
  dgeqrf_(&M,&N,A,&LDA,TAU,WORK,&LWORK,&info);
  return info;
}
/* form the explicit Q matrix from the elementary reflectors produced by
   my_dgeqrf (LAPACK dorgqr). K: number of reflectors. Returns info (0 = ok). */
int
my_dorgqr(int M,int N,int K,double *A,int LDA,double *TAU,double *WORK,int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dorgqr_(int *M, int *N, int *K, double *A, int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
  int info;
  dorgqr_(&M, &N, &K, A, &LDA, TAU, WORK, &LWORK, &info);
  return info;
}
/* solve a triangular system Ax=b (LAPACK dtrtrs). UPLO: 'U'/'L', TRANS:
   'N'/'T'/'C', DIAG: 'N' or 'U' (unit diagonal). B is overwritten with x.
   Returns LAPACK info (0 = ok, >0 = singular). */
int
my_dtrtrs(char UPLO, char TRANS, char DIAG,int N,int NRHS,double *A,int LDA,double *B,int LDB) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dtrtrs_(char *UPLO,char *TRANS,char *DIAG,int *N,int *NRHS,double *A,int *LDA,double *B,int *LDB,int *INFO);
  int info;
  dtrtrs_(&UPLO,&TRANS,&DIAG,&N,&NRHS,A,&LDA,B,&LDB,&info);
  return info;
}
/* BLAS zcopy wrapper for complex double vectors: y <- x.
   x is read with stride Nx (x must hold at least N*Nx elements) and
   y is written with stride Ny (y must hold at least N*Ny elements). */
void
my_ccopy(int N, complex double *x, int Nx, complex double *y, int Ny) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zcopy_(int *N, complex double *x, int *incx, complex double *y, int *incy);
  if (Nx!=1 || Ny!=1) {
    zcopy_(&N,x,&Nx,y,&Ny);
  } else {
    /* contiguous case: plain memcpy is equivalent and cheaper */
    memcpy((void*)y,(void*)x,sizeof(complex double)*(size_t)N);
  }
}
/* blas zscal: in-place complex scale, x <- a*x (unit stride). */
void
my_cscal(int N, complex double a, complex double *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zscal_(int *N, complex double *alpha, complex double *x, int *incx);
  int i=1; /* unit stride */
  zscal_(&N,&a,x,&i);
}
/* BLAS zaxpy: y <- a*x + y for complex double vectors (unit stride). */
void
my_caxpy(int N, complex double *x, complex double a, complex double *y) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zaxpy_(int *N, complex double *alpha, complex double *x, int *incx, complex double *y, int *incy);
  int i=1; /* strides */
  zaxpy_(&N,&a,x,&i,y,&i);
}
/* BLAS zdotc: conjugated inner product x^H*y.
   NOTE(review): declaring zdotc_ as returning complex double assumes the
   gfortran value-return convention; f2c-style BLAS (and some vendor builds)
   instead return the result through a hidden first argument, which would
   silently corrupt the result here — verify against the linked BLAS. */
complex double
my_cdot(int N, complex double *x, complex double *y) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern complex double zdotc_(int *N, complex double *x, int *incx, complex double *y, int *incy);
  int i=1; /* unit stride */
  return(zdotc_(&N,x,&i,y,&i));
}
/* complex SVD: A = U S VT (LAPACK zgesvd), so V needs NOT to be transposed.
   S is real; RWORK must provide at least 5*min(M,N) doubles. A is destroyed.
   Returns LAPACK info (0 = ok). */
int
my_zgesvd(char JOBU, char JOBVT, int M, int N, complex double *A, int LDA, double *S,
   complex double *U, int LDU, complex double *VT, int LDVT, complex double *WORK, int LWORK, double *RWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zgesvd_(char *JOBU, char *JOBVT, int *M, int *N, complex double *A,
    int *LDA, double *S, complex double *U, int *LDU, complex double *VT, int *LDVT,
    complex double *WORK, int *LWORK, double *RWORK, int *info);
  int info;
  zgesvd_(&JOBU,&JOBVT,&M,&N,A,&LDA,S,U,&LDU,VT,&LDVT,WORK,&LWORK,RWORK,&info);
  return info;
}
/* complex least-squares solve of Ax=b via QR factorization (LAPACK zgels).
   B is overwritten with the solution. Returns LAPACK info (0 = ok). */
int
my_zgels(char TRANS, int M, int N, int NRHS, complex double *A, int LDA, complex double *B, int LDB, complex double *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zgels_(char *TRANS, int *M, int *N, int *NRHS, complex double *A, int *LDA, complex double *B, int *LDB, complex double *WORK, int *LWORK, int *INFO);
  int info;
  zgels_(&TRANS,&M,&N,&NRHS,A,&LDA,B,&LDB,WORK,&LWORK,&info);
  return info;
}
/* single-precision complex least-squares solve of Ax=b via QR (LAPACK cgels).
   B is overwritten with the solution. Returns LAPACK info (0 = ok). */
int
my_cgels(char TRANS, int M, int N, int NRHS, complex float *A, int LDA, complex float *B, int LDB, complex float *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void cgels_(char *TRANS, int *M, int *N, int *NRHS, complex float *A, int *LDA, complex float *B, int *LDB, complex float *WORK, int *LWORK, int *INFO);
  int info;
  cgels_(&TRANS,&M,&N,&NRHS,A,&LDA,B,&LDB,WORK,&LWORK,&info);
  return info;
}
/* BLAS ZGEMM: C = alpha*op(A)*op(B) + beta*C for complex double matrices.
   transa/transb: 'N', 'T' or 'C'; column-major layout with given leading dims. */
void
my_zgemm(char transa, char transb, int M, int N, int K, complex double alpha, complex double *A, int lda, complex double *B, int ldb, complex double beta, complex double *C, int ldc) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, complex double *ALPHA, complex double *A, int *LDA, complex double *B, int * LDB, complex double *BETA, complex double *C, int *LDC);
  zgemm_(&transa, &transb, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
}
/* blas dznrm2: Euclidean norm ||x||_2 of a complex double vector (real result). */
double
my_cnrm2(int N, complex double *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern double dznrm2_(int *N, complex double *x, int *incx);
  int i=1; /* unit stride */
  return(dznrm2_(&N,x,&i));
}
/* BLAS scopy wrapper for float vectors: y <- x.
   x is read with stride Nx (x must hold at least N*Nx floats) and
   y is written with stride Ny (y must hold at least N*Ny floats). */
void
my_fcopy(int N, float *x, int Nx, float *y, int Ny) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void scopy_(int *N, float *x, int *incx, float *y, int *incy);
  if (Nx!=1 || Ny!=1) {
    scopy_(&N,x,&Nx,y,&Ny);
  } else {
    /* contiguous case: plain memcpy is equivalent and cheaper */
    memcpy((void*)y,(void*)x,sizeof(float)*(size_t)N);
  }
}
/* LAPACK eigenvalue expert routine (dsyevx) for a real symmetric matrix.
   jobz: 'N'/'V' (values only / values+vectors); range: 'A','V','I' selects
   all, an interval [vl,vu], or indices [il,iu]. W gets eigenvalues, Z the
   eigenvectors (if requested). Returns LAPACK info (0 = ok). */
int
my_dsyevx(char jobz, char range, char uplo, int N, double *A, int lda,
  double vl, double vu, int il, int iu, double abstol, int M, double *W,
  double *Z, int ldz, double *WORK, int lwork, int *iwork, int *ifail) {
 extern void dsyevx_(char *JOBZ, char *RANGE, char *UPLO, int *N, double *A, int *LDA,
  double *VL, double *VU, int *IL, int *IU, double *ABSTOL, int *M, double *W, double *Z,
  int *LDZ, double *WORK, int *LWORK, int *IWORK, int *IFAIL, int *INFO);
 int info;
 dsyevx_(&jobz,&range,&uplo,&N,A,&lda,&vl,&vu,&il,&iu,&abstol,&M,W,Z,&ldz,WORK,&lwork,iwork,ifail,&info);
 return info;
}
/* BLAS zher: Hermitian rank-1 update, A <- alpha*x*x^H + A.
   uplo selects which triangle of A is stored/updated; alpha is real. */
void
my_zher(char uplo, int N, double alpha, complex double *x, int incx, complex double *A, int lda) {
 extern void zher_(char *UPLO, int *N, double *ALPHA, complex double *X, int *INCX, complex double *A, int *LDA);
 zher_(&uplo,&N,&alpha,x,&incx,A,&lda);
}

Binary file not shown.

View File

@ -1,705 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include "Solvers.h"
#include <cuda_runtime.h>
//#define DEBUG
/* helper functions for diagnostics */
/* Diagnostic helper: if a CUDA runtime call returned an error, print it with
   the source location and abort. Compiled to a no-op (status ignored) unless
   CUDA_DEBUG is defined.
   Fix: file is now const char* — callers pass __FILE__ string literals, and
   binding a string literal to a non-const char* is invalid in modern C++
   (CUDA sources are compiled as C++). */
static void
checkCudaError(cudaError_t err, const char *file, int line)
{
#ifdef CUDA_DEBUG
 if(!err)
  return;
 fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
 exit(EXIT_FAILURE);
#endif
}
/* Diagnostic helper: abort with the source location if a cuBLAS call did not
   return CUBLAS_STATUS_SUCCESS. Compiled to a no-op unless CUDA_DEBUG is
   defined.
   Fix: file is now const char* — callers pass __FILE__ string literals, and
   binding a string literal to a non-const char* is invalid in modern C++. */
static void
checkCublasError(cublasStatus_t cbstatus, const char *file, int line)
{
#ifdef CUDA_DEBUG
 if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
  fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
  exit(EXIT_FAILURE);
 }
#endif
}
/* OS-LM, but f() and jac() calculations are done
entirely in the GPU */
int
oslevmar_der_single_cuda(
void (*func)(double *p, double *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */
void (*jacf)(double *p, double *j, int m, int n, void *adata), /* function to evaluate the Jacobian \part x / \part p */
double *p, /* I/O: initial parameter estimates. On output has the estimated solution */
double *x, /* I: measurement vector. NULL implies a zero vector */
int M, /* I: parameter vector dimension (i.e. #unknowns) */
int N, /* I: measurement vector dimension */
int itmax, /* I: maximum number of iterations */
double opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
* stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used
*/
double info[10],
/* O: information regarding the minimization. Set to NULL if don't care
* info[0]= ||e||_2 at initial p.
* info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
* info[5]= # iterations,
* info[6]=reason for terminating: 1 - stopped by small gradient J^T e
* 2 - stopped by small Dp
* 3 - stopped by itmax
* 4 - singular matrix. Restart from current p with increased mu
* 5 - no further error reduction is possible. Restart with increased mu
* 6 - stopped by small ||e||_2
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
* info[7]= # function evaluations
* info[8]= # Jacobian evaluations
* info[9]= # linear systems solved, i.e. # attempts for reducing error
*/
cublasHandle_t cbhandle, /* device handle */
cusolverDnHandle_t solver_handle, /* solver handle */
double *gWORK, /* GPU allocated memory */
int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
int tileoff, /* tile offset when solving for many chunks */
int ntiles, /* total tile (data) size being solved for */
int randomize, /* if >0 randomize */
void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
* Set to NULL if not needed
*/
{
/* general note: all device variables end with a 'd' */
int stop=0;
cudaError_t err;
cublasStatus_t cbstatus;
int nu=2,nu2;
double p_L2, Dp_L2=DBL_MAX, dF, dL, p_eL2, jacTe_inf=0.0, pDp_eL2, init_p_eL2;
double tmp,mu=0.0;
double tau, eps1, eps2, eps2_sq, eps3;
int k,ci,issolved;
double *hxd;
double *ed;
double *xd;
double *jacd;
double *jacTjacd,*jacTjacd0;
double *Dpd,*bd;
double *pd,*pnewd;
double *jacTed;
/* used in QR solver */
double *taud;
/* used in SVD solver */
double *Ud;
double *VTd;
double *Sd;
/* ME data */
me_data_t *dp=(me_data_t*)adata;
int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
/* coherency on device */
double *cohd;
/* baseline-station map on device/host */
short *bbd;
int solve_axb=linsolv;
/* setup default settings */
if(opts){
tau=opts[0];
eps1=opts[1];
eps2=opts[2];
eps2_sq=opts[2]*opts[2];
eps3=opts[3];
} else {
tau=CLM_INIT_MU;
eps1=CLM_STOP_THRESH;
eps2=CLM_STOP_THRESH;
eps2_sq=CLM_STOP_THRESH*CLM_STOP_THRESH;
eps3=CLM_STOP_THRESH;
}
/* calculate no of cuda threads and blocks */
int ThreadsPerBlock=DEFAULT_TH_PER_BK;
int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
unsigned long int moff;
if (!gWORK) {
err=cudaMalloc((void**)&xd, N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&jacd, M*N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&jacTjacd, M*M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&jacTed, M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&jacTjacd0, M*M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&Dpd, M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&bd, M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&pd, M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&pnewd, M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
/* needed for calculating f() and jac() */
err=cudaMalloc((void**) &bbd, Nbase*2*sizeof(short));
checkCudaError(err,__FILE__,__LINE__);
/* we need coherencies for only this cluster */
err=cudaMalloc((void**) &cohd, Nbase*8*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&hxd, N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&ed, N*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
/* memory allocation: different solvers */
if (solve_axb==1) {
err=cudaMalloc((void**)&taud, M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
} else if (solve_axb==2) {
err=cudaMalloc((void**)&Ud, M*M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&VTd, M*M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&Sd, M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
}
} else {
moff=0;
xd=&gWORK[moff];
moff+=N;
jacd=&gWORK[moff];
moff+=M*N;
jacTjacd=&gWORK[moff];
moff+=M*M;
jacTed=&gWORK[moff];
moff+=M;
jacTjacd0=&gWORK[moff];
moff+=M*M;
Dpd=&gWORK[moff];
moff+=M;
bd=&gWORK[moff];
moff+=M;
pd=&gWORK[moff];
moff+=M;
pnewd=&gWORK[moff];
moff+=M;
cohd=&gWORK[moff];
moff+=Nbase*8;
hxd=&gWORK[moff];
moff+=N;
ed=&gWORK[moff];
moff+=N;
if (solve_axb==1) {
taud=&gWORK[moff];
moff+=M;
} else if (solve_axb==2) {
Ud=&gWORK[moff];
moff+=M*M;
VTd=&gWORK[moff];
moff+=M*M;
Sd=&gWORK[moff];
moff+=M;
}
bbd=(short*)&gWORK[moff];
moff+=(Nbase*2*sizeof(short))/sizeof(double);
}
/* extra storage for cusolver */
int work_size=0;
int *devInfo;
int devInfo_h=0;
err=cudaMalloc((void**)&devInfo, sizeof(int));
checkCudaError(err,__FILE__,__LINE__);
double *work;
double *rwork;
if (solve_axb==0) {
cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_UPPER, M, jacTjacd, M, &work_size);
err=cudaMalloc((void**)&work, work_size*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
} else if (solve_axb==1) {
cusolverDnDgeqrf_bufferSize(solver_handle, M, M, jacTjacd, M, &work_size);
err=cudaMalloc((void**)&work, work_size*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
} else {
cusolverDnDgesvd_bufferSize(solver_handle, M, M, &work_size);
err=cudaMalloc((void**)&work, work_size*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
err=cudaMalloc((void**)&rwork, 5*M*sizeof(double));
checkCudaError(err,__FILE__,__LINE__);
}
err=cudaMemcpyAsync(pd, p, M*sizeof(double), cudaMemcpyHostToDevice,0);
checkCudaError(err,__FILE__,__LINE__);
/* need to give right offset for coherencies */
/* offset: cluster offset+time offset */
err=cudaMemcpyAsync(cohd, &(dp->ddcoh[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(double), cudaMemcpyHostToDevice,0);
checkCudaError(err,__FILE__,__LINE__);
/* correct offset for baselines */
err=cudaMemcpyAsync(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(short), cudaMemcpyHostToDevice,0);
checkCudaError(err,__FILE__,__LINE__);
cudaDeviceSynchronize();
/* xd <=x */
err=cudaMemcpyAsync(xd, x, N*sizeof(double), cudaMemcpyHostToDevice,0);
checkCudaError(err,__FILE__,__LINE__);
/* ### compute e=x - f(p) and its L2 norm */
/* ### e=x-hx, p_eL2=||e|| */
/* p: params (Mx1), x: data (Nx1), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations*/
cudakernel_func(ThreadsPerBlock, (Nbase+ThreadsPerBlock-1)/ThreadsPerBlock, pd,hxd,M,N, cohd, bbd, Nbase, dp->M, dp->N);
/* e=x */
cbstatus=cublasDcopy(cbhandle, N, xd, 1, ed, 1);
/* e=x-hx */
double alpha=-1.0;
cbstatus=cublasDaxpy(cbhandle, N, &alpha, hxd, 1, ed, 1);
/* norm ||e|| */
cbstatus=cublasDnrm2(cbhandle, N, ed, 1, &p_eL2);
/* square */
p_eL2=p_eL2*p_eL2;
init_p_eL2=p_eL2;
if(!finite(p_eL2)) stop=7;
/* setup OS subsets and stating offsets */
/* ed : N, cohd : Nbase*8, bbd : Nbase*2 full size */
/* if ntiles<Nsubsets, make Nsubsets=ntiles */
int Nsubsets=10;
if (ntiles<Nsubsets) { Nsubsets=ntiles; }
/* FIXME: is 0.1 enough ? */
int max_os_iter=(int)ceil(0.1*(double)Nsubsets);
int Npersubset=(N+Nsubsets-1)/Nsubsets;
int Nbasepersubset=(Nbase+Nsubsets-1)/Nsubsets;
int *Nos,*Nbaseos,*edI,*NbI,*subI=0;
if ((Nos=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((Nbaseos=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((edI=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
if ((NbI=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
exit(1);
}
int l,ositer;
k=l=0;
for (ci=0; ci<Nsubsets; ci++) {
edI[ci]=k;
NbI[ci]=l;
if (k+Npersubset<N) {
Nos[ci]=Npersubset;
Nbaseos[ci]=Nbasepersubset;
} else {
Nos[ci]=N-k;
Nbaseos[ci]=Nbase-l;
}
k=k+Npersubset;
l=l+Nbasepersubset;
}
#ifdef DEBUG
for (ci=0; ci<Nsubsets; ci++) {
printf("ci=%d, Nos=%d, edI=%d, Nbseos=%d, NbI=%d\n",ci,Nos[ci],edI[ci],Nbaseos[ci],NbI[ci]);
}
#endif
/**** iteration loop ***********/
for(k=0; k<itmax && !stop; ++k){
#ifdef DEBUG
printf("iter=%d err=%lf\n",k,p_eL2);
#endif
if(p_eL2<=eps3){ /* error is small */
stop=6;
break;
}
if (randomize) {
/* random permutation of subsets */
subI=random_permutation(Nsubsets,0,0);
}
/**************** OS loop ***************************/
for (ositer=0; ositer<max_os_iter; ositer++) {
/* select subset to compute Jacobian */
if (randomize) {
l=subI[ositer];
} else {
l=(k+ositer)%Nsubsets;
}
/* NOTE: no. of subsets >= no. of OS iterations, so select
a random set of subsets */
/* N, Nbase changes with subset, cohd,bbd,ed gets offsets */
/* ed : N, cohd : Nbase*8, bbd : Nbase*2 full size */
/* p: params (Mx1), jacd: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations*/
/* FIXME thread/block sizes 16x16=256, so 16 is chosen */
//cudakernel_jacf(ThreadsPerBlock, ThreadsPerBlock/4, pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
cudakernel_jacf(ThreadsPerBlock, ThreadsPerBlock/4, pd, jacd, M, Nos[l], &cohd[8*NbI[l]], &bbd[2*NbI[l]], Nbaseos[l], dp->M, dp->N);
/* Compute J^T J and J^T e */
/* Cache efficient computation of J^T J based on blocking
*/
/* since J is in ROW major order, assume it is transposed,
so actually calculate A=J*J^T, where J is size MxN */
//status=culaDeviceDgemm('N','T',M,M,Nos[l],1.0,jacd,M,jacd,M,0.0,jacTjacd,M);
//checkStatus(status,__FILE__,__LINE__);
double cone=1.0; double czero=0.0;
cbstatus=cublasDgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_T,M,M,Nos[l],&cone,jacd,M,jacd,M,&czero,jacTjacd,M);
/* create backup */
/* copy jacTjacd0<=jacTjacd */
cbstatus=cublasDcopy(cbhandle, M*M, jacTjacd, 1, jacTjacd0, 1);
/* J^T e */
/* calculate b=J^T*e (actually compute b=J*e, where J in row major (size MxN) */
//status=culaDeviceDgemv('N',M,Nos[l],1.0,jacd,M,&ed[edI[l]],1,0.0,jacTed,1);
//checkStatus(status,__FILE__,__LINE__);
cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_N,M,Nos[l],&cone,jacd,M,&ed[edI[l]],1,&czero,jacTed,1);
/* Compute ||J^T e||_inf and ||p||^2 */
/* find infinity norm of J^T e, 1 based indexing*/
cbstatus=cublasIdamax(cbhandle, M, jacTed, 1, &ci);
err=cudaMemcpy(&jacTe_inf,&(jacTed[ci-1]),sizeof(double),cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
/* L2 norm of current parameter values */
/* norm ||Dp|| */
cbstatus=cublasDnrm2(cbhandle, M, pd, 1, &p_L2);
p_L2=p_L2*p_L2;
if(jacTe_inf<0.0) {jacTe_inf=-jacTe_inf;}
#ifdef DEBUG
printf("Inf norm=%lf\n",jacTe_inf);
#endif
/* check for convergence */
if((jacTe_inf <= eps1)){
Dp_L2=0.0; /* no increment for p in this case */
stop=1;
break;
}
/* compute initial (k=0) damping factor */
if (k==0) {
/* find max diagonal element (stride is M+1) */
/* should be MAX not MAX(ABS) */
cbstatus=cublasIdamax(cbhandle, M, jacTjacd, M+1, &ci); /* 1 based index */
ci=(ci-1)*(M+1); /* right value of the diagonal */
err=cudaMemcpy(&tmp,&(jacTjacd[ci]),sizeof(double),cudaMemcpyDeviceToHost);
checkCudaError(err,__FILE__,__LINE__);
mu=tau*tmp;
}
/* determine increment using adaptive damping */
while(1){
/* augment normal equations */
/* increment A => A+ mu*I, increment diagonal entries */
/* copy jacTjacd<=jacTjacd0 */
cbstatus=cublasDcopy(cbhandle, M*M, jacTjacd0, 1, jacTjacd, 1);
cudakernel_diagmu(ThreadsPerBlock, BlocksPerGrid, M, jacTjacd, mu);
#ifdef DEBUG
printf("mu=%lf\n",mu);
#endif
/*************************************************************************/
issolved=0;
/* solve augmented equations A x = b */
/* A==jacTjacd, b==Dpd, after solving, x==Dpd */
/* b=jacTed : intially right hand side, at exit the solution */
if (solve_axb==0) {
/* Cholesky solver **********************/
/* lower triangle of Ad is destroyed */
//status=culaDeviceDpotrf('U',M,jacTjacd,M);
cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_UPPER, M, jacTjacd, M, work, work_size, devInfo);
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
if (!devInfo_h) {
issolved=1;
} else {
issolved=0;
#ifdef DEBUG
fprintf(stderr,"Singular matrix\n");
#endif
}
if (issolved) {
/* copy Dpd<=jacTed */
cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
#ifdef DEBUG
checkCublasError(cbstatus,__FILE__,__LINE__);
#endif
//status=culaDeviceDpotrs('U',M,1,jacTjacd,M,Dpd,M);
cusolverDnDpotrs(solver_handle, CUBLAS_FILL_MODE_UPPER,M,1,jacTjacd,M,Dpd,M,devInfo);
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
if (devInfo_h) {
issolved=0;
#ifdef DEBUG
fprintf(stderr,"Singular matrix\n");
#endif
}
}
} else if (solve_axb==1) {
/* QR solver ********************************/
//status=culaDeviceDgeqrf(M,M,jacTjacd,M,taud);
cusolverDnDgeqrf(solver_handle, M, M, jacTjacd, M, taud, work, work_size, devInfo);
cudaDeviceSynchronize();
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
if (!devInfo_h) {
issolved=1;
} else {
issolved=0;
#ifdef DEBUG
fprintf(stderr,"Singular matrix\n");
#endif
}
if (issolved) {
/* copy Dpd<=jacTed */
cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
//status=culaDeviceDgeqrs(M,M,1,jacTjacd,M,taud,Dpd,M);
cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, M, 1, M, jacTjacd, M, taud, Dpd, M, work, work_size, devInfo);
cudaDeviceSynchronize();
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
if (devInfo_h) {
issolved=0;
#ifdef DEBUG
fprintf(stderr,"Singular matrix\n");
#endif
} else {
cone=1.0;
cbstatus=cublasDtrsm(cbhandle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,M,1,&cone,jacTjacd,M,Dpd,M);
}
}
} else {
/* SVD solver *********************************/
/* U S VT = A */
//status=culaDeviceDgesvd('A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M);
//checkStatus(status,__FILE__,__LINE__);
cusolverDnDgesvd(solver_handle,'A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M,work,work_size,rwork,devInfo);
cudaDeviceSynchronize();
/* copy Dpd<=jacTed */
cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
/* b<=U^T * b */
//status=culaDeviceDgemv('T',M,M,1.0,Ud,M,Dpd,1,0.0,Dpd,1);
//checkStatus(status,__FILE__,__LINE__);
cone=1.0; czero=0.0;
cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_T,M,M,&cone,Ud,M,Dpd,1,&czero,Dpd,1);
/* divide by singular values Dpd[]/Sd[] for Sd[]> eps1 */
cudakernel_diagdiv(ThreadsPerBlock, BlocksPerGrid, M, eps1, Dpd, Sd);
/* b<=VT^T * b */
//status=culaDeviceDgemv('T',M,M,1.0,VTd,M,Dpd,1,0.0,Dpd,1);
//checkStatus(status,__FILE__,__LINE__);
cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_T,M,M,&cone,VTd,M,Dpd,1,&czero,Dpd,1);
issolved=1;
}
/*************************************************************************/
/* compute p's new estimate and ||Dp||^2 */
if (issolved) {
/* compute p's new estimate and ||Dp||^2 */
/* pnew=p+Dp */
/* pnew=p */
cbstatus=cublasDcopy(cbhandle, M, pd, 1, pnewd, 1);
/* pnew=pnew+Dp */
alpha=1.0;
cbstatus=cublasDaxpy(cbhandle, M, &alpha, Dpd, 1, pnewd, 1);
/* norm ||Dp|| */
cbstatus=cublasDnrm2(cbhandle, M, Dpd, 1, &Dp_L2);
Dp_L2=Dp_L2*Dp_L2;
#ifdef DEBUG
printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
#endif
if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
stop=2;
break;
}
if(Dp_L2>=(p_L2+eps2)/(CLM_EPSILON*CLM_EPSILON)){ /* almost singular */
stop=4;
break;
}
/* new function value */
/* compute ||e(pDp)||_2 */
/* ### hx=x-hx, pDp_eL2=||hx|| */
/* copy to device */
/* hxd<=hx */
cudakernel_func(ThreadsPerBlock, (Nbase+ThreadsPerBlock-1)/ThreadsPerBlock, pnewd, hxd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
/* e=x */
cbstatus=cublasDcopy(cbhandle, N, xd, 1, ed, 1);
/* e=x-hx */
alpha=-1.0;
cbstatus=cublasDaxpy(cbhandle, N, &alpha, hxd, 1, ed, 1);
/* note: e is updated */
/* norm ||e|| */
cbstatus=cublasDnrm2(cbhandle, N, ed, 1, &pDp_eL2);
pDp_eL2=pDp_eL2*pDp_eL2;
if(!finite(pDp_eL2)){ /* sum of squares is not finite, most probably due to a user error.
*/
stop=7;
break;
}
/* dL=Dp'*(mu*Dp+jacTe) */
/* bd=jacTe+mu*Dp */
cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, bd, 1);
cbstatus=cublasDaxpy(cbhandle, M, &mu, Dpd, 1, bd, 1);
cbstatus=cublasDdot(cbhandle, M, Dpd, 1, bd, 1, &dL);
dF=p_eL2-pDp_eL2;
#ifdef DEBUG
printf("dF=%lf, dL=%lf\n",dF,dL);
#endif
if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
tmp=(2.0*dF/dL-1.0);
tmp=1.0-tmp*tmp*tmp;
mu=mu*((tmp>=CLM_ONE_THIRD)? tmp : CLM_ONE_THIRD);
nu=2;
/* update p's estimate */
cbstatus=cublasDcopy(cbhandle, M, pnewd, 1, pd, 1);
/* update ||e||_2 */
p_eL2=pDp_eL2;
break;
}
}
/* if this point is reached, either the linear system could not be solved or
* the error did not reduce; in any case, the increment must be rejected
*/
mu*=(double)nu;
nu2=nu<<1; // 2*nu;
if(nu2<=nu){ /* nu has wrapped around (overflown). */
stop=5;
break;
}
nu=nu2;
} /* inner loop */
}
if (randomize) {
free(subI);
}
/**************** end OS loop ***************************/
}
/**** end iteration loop ***********/
free(Nos);
free(Nbaseos);
free(edI);
free(NbI);
if(k>=itmax) stop=3;
/* copy back current solution */
err=cudaMemcpyAsync(p,pd,M*sizeof(double),cudaMemcpyDeviceToHost,0);
checkCudaError(err,__FILE__,__LINE__);
checkCublasError(cbstatus,__FILE__,__LINE__);
/* synchronize async operations */
cudaDeviceSynchronize();
if (!gWORK) {
cudaFree(xd);
cudaFree(jacd);
cudaFree(jacTjacd);
cudaFree(jacTjacd0);
cudaFree(jacTed);
cudaFree(Dpd);
cudaFree(bd);
cudaFree(pd);
cudaFree(pnewd);
cudaFree(hxd);
cudaFree(ed);
if (solve_axb==1) {
cudaFree(taud);
} else if (solve_axb==2) {
cudaFree(Ud);
cudaFree(VTd);
cudaFree(Sd);
}
cudaFree(cohd);
cudaFree(bbd);
}
cudaFree(devInfo);
cudaFree(work);
if (solve_axb==2) {
cudaFree(rwork);
}
#ifdef DEBUG
printf("stop=%d\n",stop);
#endif
if(info){
info[0]=init_p_eL2;
info[1]=p_eL2;
info[2]=jacTe_inf;
info[3]=Dp_L2;
info[4]=mu;
info[5]=(double)k;
info[6]=(double)stop;
info[7]=(double)0;
info[8]=(double)0;
info[9]=(double)0;
}
return 0;
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,721 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "cuda.h"
#include <cuComplex.h>
#include <stdio.h>
/* enable this for kernel failure detection */
//#define CUDA_DBG
__global__ void kernel_deriv_robust(int Nbase, int tilesz, int M, int Ns, int Nparam, int goff, double robust_nu, double *x, double *coh, double *p, short *bb, int *ptoclus, double *grad){
/* Gradient of the robust (Student's t, parameter robust_nu) cost function.
   One thread computes grad[n], the derivative w.r.t. global parameter
   np = n+goff, by summing contributions from every unflagged baseline that
   involves the station owning that parameter. */
/* global thread index */
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
/* parameter number of this thread */
unsigned int np=n+goff;
/* this thread works on
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
bb[2*n:2*n+1] (sta1,sta2)
organization of p (N stations and M clusters)
sta 0 sta 1 sta 2 .... sta N-1
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
......
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
organization of coherencies (coh)
[0, 8*M-1] : baseline 0
[8*M, 8*M+8*M-1]: baseline 1
[n*8*M, n*8*M+8*M-1]: baseline n
......
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
residual error stored at sum[n]
*/
if (n<Nparam) {
/* this thread will calculate derivative for parameter np,
and store it in grad[n] */
double gsum=0.0;
/* find which cluster this parameter belongs to */
/* ptoclus[0,1] are nchunk, and p[start index] for each cluster */
int cli=0;
/* np should be within ptoclus[2*cli+1]....ptoclus[2*cli+1]+ptoclus[2*cli]*8*Ns-1 */
while ((cli<M) && (np<ptoclus[2*cli+1] || np>ptoclus[2*cli+1]+ptoclus[2*cli]*8*Ns-1)) { cli++; }
/* now either ci>=M: cluster not found
or ci<M and ci is the right cluster */
/* if the loop ran off the end, re-test the last cluster (cli==M indexes
   ptoclus[2*M-2], ptoclus[2*M-1], i.e. the last cluster's entries) */
if ((cli==M) && np>=ptoclus[2*cli-1] && np<=ptoclus[2*cli-1]+ptoclus[2*cli-2]*8*Ns-1) {
 cli--;
}
if (cli<M) {
int pstart=ptoclus[2*cli+1];
int nchunk=ptoclus[2*cli];
/* find station and which parameter for this thread (parameter) */
/* this cluster has parameters ptoclus[2*cli+1] ..... +ptoclus[2*cli]*8*Ns-1 */
unsigned int np_s=(np-pstart)%(8*Ns);
unsigned int stc=np_s/8; /* this is the station of this param */
/* which chunk does this parameter belong to */
unsigned int tpchunk=(np-pstart)/(8*Ns);
int tilesperchunk=(tilesz+nchunk-1)/nchunk;
/* total baselines in one tile */
int Nbase0=(Ns-1)*Ns/2;
for(unsigned int nb=0; nb<Nbase; nb++) {
/* which tile is this ? */
int ttile=nb/Nbase0;
/* which chunk this tile belongs to */
int tptile=ttile/tilesperchunk;
/* now tptile has to match tpchunk, otherwise ignore calculation */
if (tptile==tpchunk) {
int sta1=(int)bb[2*nb];
int sta2=(int)bb[2*nb+1];
/* only calculate deriv if baseline corresponds
to this station and baseline is not flagged */
/* flagged baselines will have sta1==sta2==-1 */
if (((stc==sta1)||(stc==sta2)) && sta1>=0 && sta2>=0) {
/* which parameter 0..7 */
unsigned int stoff=np_s-stc*8;
/* which cluster 0..M-1 */
unsigned int stm=cli;
/* read residual vector, real,imag separate*/
double xr[8];
xr[0]=x[nb*8];
xr[1]=x[nb*8+1];
xr[2]=x[nb*8+2];
xr[3]=x[nb*8+3];
xr[4]=x[nb*8+4];
xr[5]=x[nb*8+5];
xr[6]=x[nb*8+6];
xr[7]=x[nb*8+7];
/* read in coherency */
cuDoubleComplex C[4];
C[0].x=coh[8*nb*M+8*stm];
C[0].y=coh[8*nb*M+8*stm+1];
C[1].x=coh[8*nb*M+8*stm+2];
C[1].y=coh[8*nb*M+8*stm+3];
C[2].x=coh[8*nb*M+8*stm+4];
C[2].y=coh[8*nb*M+8*stm+5];
C[3].x=coh[8*nb*M+8*stm+6];
C[3].y=coh[8*nb*M+8*stm+7];
cuDoubleComplex G1[4];
cuDoubleComplex G2[4];
if(stc==sta1) {
/* d/dp of G1: a canonical basis vector (1 at this parameter's slot) */
double pp[8];
pp[0]=0.0;
pp[1]=0.0;
pp[2]=0.0;
pp[3]=0.0;
pp[4]=0.0;
pp[5]=0.0;
pp[6]=0.0;
pp[7]=0.0;
pp[stoff]=1.0;
G1[0].x=pp[0];
G1[0].y=pp[1];
G1[1].x=pp[2];
G1[1].y=pp[3];
G1[2].x=pp[4];
G1[2].y=pp[5];
G1[3].x=pp[6];
G1[3].y=pp[7];
/* conjugate and transpose G2 (transpose = swapped G2[1]/G2[2] indices,
   conjugate = negated imaginary parts) */
G2[0].x=p[pstart+tpchunk*8*Ns+sta2*8];
G2[0].y=-p[pstart+tpchunk*8*Ns+sta2*8+1];
G2[2].x=p[pstart+tpchunk*8*Ns+sta2*8+2];
G2[2].y=-p[pstart+tpchunk*8*Ns+sta2*8+3];
G2[1].x=p[pstart+tpchunk*8*Ns+sta2*8+4];
G2[1].y=-p[pstart+tpchunk*8*Ns+sta2*8+5];
G2[3].x=p[pstart+tpchunk*8*Ns+sta2*8+6];
G2[3].y=-p[pstart+tpchunk*8*Ns+sta2*8+7];
} else if (stc==sta2) {
/* d/dp of G2^H: basis vector, conjugate-transposed below */
double pp[8];
pp[0]=0.0;
pp[1]=0.0;
pp[2]=0.0;
pp[3]=0.0;
pp[4]=0.0;
pp[5]=0.0;
pp[6]=0.0;
pp[7]=0.0;
pp[stoff]=1.0;
/* conjugate and transpose G2 */
G2[0].x=pp[0];
G2[0].y=-pp[1];
G2[2].x=pp[2];
G2[2].y=-pp[3];
G2[1].x=pp[4];
G2[1].y=-pp[5];
G2[3].x=pp[6];
G2[3].y=-pp[7];
/* G1 = station sta1 gains, read as-is (no conjugation here) */
G1[0].x=p[pstart+tpchunk*8*Ns+sta1*8];
G1[0].y=p[pstart+tpchunk*8*Ns+sta1*8+1];
G1[1].x=p[pstart+tpchunk*8*Ns+sta1*8+2];
G1[1].y=p[pstart+tpchunk*8*Ns+sta1*8+3];
G1[2].x=p[pstart+tpchunk*8*Ns+sta1*8+4];
G1[2].y=p[pstart+tpchunk*8*Ns+sta1*8+5];
G1[3].x=p[pstart+tpchunk*8*Ns+sta1*8+6];
G1[3].y=p[pstart+tpchunk*8*Ns+sta1*8+7];
}
cuDoubleComplex T1[4];
/* T1=G1*C (2x2 complex matrix product, arrays in row-major [00,01,10,11]) */
T1[0]=cuCadd(cuCmul(G1[0],C[0]),cuCmul(G1[1],C[2]));
T1[1]=cuCadd(cuCmul(G1[0],C[1]),cuCmul(G1[1],C[3]));
T1[2]=cuCadd(cuCmul(G1[2],C[0]),cuCmul(G1[3],C[2]));
T1[3]=cuCadd(cuCmul(G1[2],C[1]),cuCmul(G1[3],C[3]));
cuDoubleComplex T2[4];
/* T2=T1*G2 , G2 conjugate transposed */
T2[0]=cuCadd(cuCmul(T1[0],G2[0]),cuCmul(T1[1],G2[2]));
T2[1]=cuCadd(cuCmul(T1[0],G2[1]),cuCmul(T1[1],G2[3]));
T2[2]=cuCadd(cuCmul(T1[2],G2[0]),cuCmul(T1[3],G2[2]));
T2[3]=cuCadd(cuCmul(T1[2],G2[1]),cuCmul(T1[3],G2[3]));
/* calculate product xr*vec(J_p C J_q^H )/(nu+residual^2) */
/* the 1/(nu + r^2) factor is the robust (t-distribution) downweighting */
double dsum;
dsum=xr[0]*T2[0].x/(robust_nu+xr[0]*xr[0]);
dsum+=xr[1]*T2[0].y/(robust_nu+xr[1]*xr[1]);
dsum+=xr[2]*T2[1].x/(robust_nu+xr[2]*xr[2]);
dsum+=xr[3]*T2[1].y/(robust_nu+xr[3]*xr[3]);
dsum+=xr[4]*T2[2].x/(robust_nu+xr[4]*xr[4]);
dsum+=xr[5]*T2[2].y/(robust_nu+xr[5]*xr[5]);
dsum+=xr[6]*T2[3].x/(robust_nu+xr[6]*xr[6]);
dsum+=xr[7]*T2[3].y/(robust_nu+xr[7]*xr[7]);
/* accumulate sum NOTE
its important to get the sign right,
depending on res=data-model or res=model-data */
gsum+=2.0*dsum;
}
}
}
}
grad[n]=gsum;
}
}
__global__ void kernel_func_wt(int Nbase, double *x, double *coh, double *p, short *bb, double *wt, int N){
/* Weighted model evaluation: for baseline n compute G1*C*G2^H (2x2 complex)
   and write the 8 real components, scaled by the per-element weights wt,
   into x[8n..8n+7]. Flagged baselines (sta1==sta2==-1) are left untouched. */
/* global thread index : equal to the baseline */
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
/* this thread works on
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
bb[2*n:2*n+1] (sta1,sta2)
organization of p (N stations and M clusters)
sta 0 sta 1 sta 2 .... sta N-1
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
......
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
organization of coherencies (coh)
[0, 8*M-1] : baseline 0
[8*M, 8*M+8*M-1]: baseline 1
[n*8*M, n*8*M+8*M-1]: baseline n
......
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
residual error stored at sum[n]
*/
if(n<Nbase) {
int sta1=(int)bb[2*n];
int sta2=(int)bb[2*n+1];
/* condition for calculating this baseline sum is
1) its not flagged (sta1,sta2)>=0
*/
if (sta1>=0 && sta2>=0) {
/* G1: 2x2 Jones matrix of station sta1, row-major [00,01,10,11] */
cuDoubleComplex G1[4];
double pp[8];
pp[0]=p[sta1*8];
pp[1]=p[sta1*8+1];
pp[2]=p[sta1*8+2];
pp[3]=p[sta1*8+3];
pp[4]=p[sta1*8+4];
pp[5]=p[sta1*8+5];
pp[6]=p[sta1*8+6];
pp[7]=p[sta1*8+7];
G1[0].x=pp[0];
G1[0].y=pp[1];
G1[1].x=pp[2];
G1[1].y=pp[3];
G1[2].x=pp[4];
G1[2].y=pp[5];
G1[3].x=pp[6];
G1[3].y=pp[7];
/* C: source coherency matrix for this baseline */
cuDoubleComplex C[4];
C[0].x=coh[8*n];
C[0].y=coh[8*n+1];
C[1].x=coh[8*n+2];
C[1].y=coh[8*n+3];
C[2].x=coh[8*n+4];
C[2].y=coh[8*n+5];
C[3].x=coh[8*n+6];
C[3].y=coh[8*n+7];
cuDoubleComplex T1[4];
/* T=G1*C */
T1[0]=cuCadd(cuCmul(G1[0],C[0]),cuCmul(G1[1],C[2]));
T1[1]=cuCadd(cuCmul(G1[0],C[1]),cuCmul(G1[1],C[3]));
T1[2]=cuCadd(cuCmul(G1[2],C[0]),cuCmul(G1[3],C[2]));
T1[3]=cuCadd(cuCmul(G1[2],C[1]),cuCmul(G1[3],C[3]));
cuDoubleComplex G2[4];
/* conjugate this */
/* G2 = (station sta2 Jones)^H: imaginary parts negated here, the
   transpose is done by the swapped G2[1]/G2[2] assignments below */
pp[0]=p[sta2*8];
pp[1]=-p[sta2*8+1];
pp[2]=p[sta2*8+2];
pp[3]=-p[sta2*8+3];
pp[4]=p[sta2*8+4];
pp[5]=-p[sta2*8+5];
pp[6]=p[sta2*8+6];
pp[7]=-p[sta2*8+7];
G2[0].x=pp[0];
G2[0].y=pp[1];
G2[2].x=pp[2];
G2[2].y=pp[3];
G2[1].x=pp[4];
G2[1].y=pp[5];
G2[3].x=pp[6];
G2[3].y=pp[7];
cuDoubleComplex T2[4];
T2[0]=cuCadd(cuCmul(T1[0],G2[0]),cuCmul(T1[1],G2[2]));
T2[1]=cuCadd(cuCmul(T1[0],G2[1]),cuCmul(T1[1],G2[3]));
T2[2]=cuCadd(cuCmul(T1[2],G2[0]),cuCmul(T1[3],G2[2]));
T2[3]=cuCadd(cuCmul(T1[2],G2[1]),cuCmul(T1[3],G2[3]));
/* update model vector, with weights */
x[8*n]=wt[8*n]*T2[0].x;
x[8*n+1]=wt[8*n+1]*T2[0].y;
x[8*n+2]=wt[8*n+2]*T2[1].x;
x[8*n+3]=wt[8*n+3]*T2[1].y;
x[8*n+4]=wt[8*n+4]*T2[2].x;
x[8*n+5]=wt[8*n+5]*T2[2].y;
x[8*n+6]=wt[8*n+6]*T2[3].x;
x[8*n+7]=wt[8*n+7]*T2[3].y;
}
}
}
__global__ void kernel_jacf_wt(int Nbase, int M, double *jac, double *coh, double *p, short *bb, double *wt, int N){
/* Weighted Jacobian: thread (n,m) fills column m of the 8 Jacobian rows of
   baseline n. The derivative w.r.t. parameter m is obtained by replacing the
   Jones matrix of the parameter's station with a canonical basis matrix. */
/* global thread index : equal to the baseline */
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
/* which parameter:0...M */
unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
/* this thread works on
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
bb[2*n:2*n+1] (sta1,sta2)
organization of p (N stations and M clusters)
sta 0 sta 1 sta 2 .... sta N-1
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
......
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
organization of coherencies (coh)
[0, 8*M-1] : baseline 0
[8*M, 8*M+8*M-1]: baseline 1
[n*8*M, n*8*M+8*M-1]: baseline n
......
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
residual error stored at sum[n]
*/
if(n<Nbase && m<M) {
int sta1=(int)bb[2*n];
int sta2=(int)bb[2*n+1];
/* condition for calculating this baseline sum is
If this baseline is flagged,
or if this parameter does not belong to sta1 or sta2
we do not compute
*/
//int stc=m/8; /* 0...Ns-1 (because M=total par= 8 * Nstations */
int stc=m>>3; /* 0...Ns-1 (because M=total par= 8 * Nstations */
if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {
cuDoubleComplex C[4];
C[0].x=coh[8*n];
C[0].y=coh[8*n+1];
C[1].x=coh[8*n+2];
C[1].y=coh[8*n+3];
C[2].x=coh[8*n+4];
C[2].y=coh[8*n+5];
C[3].x=coh[8*n+6];
C[3].y=coh[8*n+7];
/* which parameter exactly 0..7 */
//int stoff=m%8;
int stoff=m-stc*8;
/* exactly one of the two branches below runs (guard guarantees
   stc==sta1 or stc==sta2), so pp1/pp2 are always fully initialized */
double pp1[8];
double pp2[8];
if (stc==sta1) {
for (int cn=0; cn<8; cn++) {
pp1[cn]=0.0;
pp2[cn]=p[sta2*8+cn];
}
pp1[stoff]=1.0;
} else if (stc==sta2) {
for (int cn=0; cn<8; cn++) {
pp2[cn]=0.0;
pp1[cn]=p[sta1*8+cn];
}
pp2[stoff]=1.0;
}
cuDoubleComplex G1[4];
G1[0].x=pp1[0];
G1[0].y=pp1[1];
G1[1].x=pp1[2];
G1[1].y=pp1[3];
G1[2].x=pp1[4];
G1[2].y=pp1[5];
G1[3].x=pp1[6];
G1[3].y=pp1[7];
cuDoubleComplex T1[4];
/* T=G1*C */
T1[0]=cuCadd(cuCmul(G1[0],C[0]),cuCmul(G1[1],C[2]));
T1[1]=cuCadd(cuCmul(G1[0],C[1]),cuCmul(G1[1],C[3]));
T1[2]=cuCadd(cuCmul(G1[2],C[0]),cuCmul(G1[3],C[2]));
T1[3]=cuCadd(cuCmul(G1[2],C[1]),cuCmul(G1[3],C[3]));
cuDoubleComplex G2[4];
/* conjugate this */
/* conjugate-transpose: negated imaginary parts plus swapped
   G2[1]/G2[2] assignments */
G2[0].x=pp2[0];
G2[0].y=-pp2[1];
G2[2].x=pp2[2];
G2[2].y=-pp2[3];
G2[1].x=pp2[4];
G2[1].y=-pp2[5];
G2[3].x=pp2[6];
G2[3].y=-pp2[7];
cuDoubleComplex T2[4];
T2[0]=cuCadd(cuCmul(T1[0],G2[0]),cuCmul(T1[1],G2[2]));
T2[1]=cuCadd(cuCmul(T1[0],G2[1]),cuCmul(T1[1],G2[3]));
T2[2]=cuCadd(cuCmul(T1[2],G2[0]),cuCmul(T1[3],G2[2]));
T2[3]=cuCadd(cuCmul(T1[2],G2[1]),cuCmul(T1[3],G2[3]));
/* update jacobian , with row weights */
/* NOTE: row major order */
jac[m+M*8*n]=wt[8*n]*T2[0].x;
jac[m+M*(8*n+1)]=wt[8*n+1]*T2[0].y;
jac[m+M*(8*n+2)]=wt[8*n+2]*T2[1].x;
jac[m+M*(8*n+3)]=wt[8*n+3]*T2[1].y;
jac[m+M*(8*n+4)]=wt[8*n+4]*T2[2].x;
jac[m+M*(8*n+5)]=wt[8*n+5]*T2[2].y;
jac[m+M*(8*n+6)]=wt[8*n+6]*T2[3].x;
jac[m+M*(8*n+7)]=wt[8*n+7]*T2[3].y;
}
}
}
__global__ void kernel_setweights(int N, double *wt, double alpha){
  /* Fill wt[0..N-1] with the constant alpha; one element per thread. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= (unsigned int)N) { return; } /* grid may overshoot N */
  wt[idx] = alpha;
}
__global__ void kernel_hadamard(int N, double *wt, double *x){
  /* Elementwise (Hadamard) product in place: x[i] <- x[i]*wt[i]. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= (unsigned int)N) { return; }
  x[idx] = x[idx]*wt[idx];
}
__global__ void kernel_updateweights(int N, double *wt, double *x, double *q, double nu){
  /* Robust (Student's t) weight update: wt[i] = (nu+1)/(nu + x[i]^2),
     and q[i] = wt[i] - log(wt[i]), which is always positive. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx < (unsigned int)N) {
    double w = ((nu+1.0)/(nu+x[idx]*x[idx]));
    wt[idx] = w;
    q[idx] = w - log(w); /* so that its +ve */
  }
}
__global__ void kernel_sqrtweights(int N, double *wt){
  /* Replace each weight by its square root, in place. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= (unsigned int)N) { return; }
  wt[idx] = sqrt(wt[idx]);
}
__device__ double
digamma(double x) {
  /* Digamma psi(x): use the recurrence psi(x) = psi(x+1) - 1/x to push the
     argument up to x >= 7, then evaluate an asymptotic series about x-1/2.
     (Arithmetic kept in the exact order of the original implementation.) */
  double acc = 0.0;
  while (x < 7.0) {          /* reduce x till x<7 */
    acc -= 1.0/x;
    x += 1.0;
  }
  x -= 1.0/2.0;
  double ix  = 1.0/x;
  double ix2 = ix*ix;
  double ix4 = ix2*ix2;
  acc += log(x)+(1./24.)*ix2-(7.0/960.0)*ix4+(31.0/8064.0)*ix4*ix2-(127.0/30720.0)*ix4*ix4;
  return acc;
}
__global__ void kernel_evaluatenu(int Nd, double qsum, double *q, double deltanu,double nulow) {
  /* Evaluate the nu-estimation objective on a uniform grid of candidate nu
     values: q[i] = psi((nu+1)/2) - log((nu+1)/2) - psi(nu/2) + log(nu/2)
                    - qsum + 1,  with nu = nulow + i*deltanu. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx < (unsigned int)Nd) {
    double nuval = (nulow+((double)idx)*deltanu);
    double dg = digamma(nuval*0.5+0.5);
    q[idx] = dg-log((nuval+1.0)*0.5);   /* psi((nu+1)/2)-log((nu+1)/2) */
    dg = digamma(nuval*0.5);
    q[idx] += -dg+log((nuval)*0.5);     /* -psi(nu/2)+log(nu/2) */
    q[idx] += -qsum+1.0;                /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
  }
}
/* only use extern if calling code is C */
extern "C"
{
/* set initial weights to 1 by a cuda kernel */
void
cudakernel_setweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double alpha) {
 /* Host wrapper: fill device array wt[0..N-1] with alpha, then block
    until the kernel has finished. */
 kernel_setweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, alpha);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t err = cudaGetLastError();
 if (err != cudaSuccess) {
   /* print the CUDA error message and exit */
   fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
   exit(-1);
 }
#endif
}
/* hadamard product by a cuda kernel x<= x*wt */
void
cudakernel_hadamard(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double *x) {
 /* Host wrapper: x <- x .* wt (elementwise) on the device, synchronous. */
 kernel_hadamard<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t err = cudaGetLastError();
 if (err != cudaSuccess) {
   /* print the CUDA error message and exit */
   fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
   exit(-1);
 }
#endif
}
/* update weights by a cuda kernel */
void
cudakernel_updateweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double *x, double *q, double robust_nu) {
 /* Host wrapper: recompute robust weights wt and log-terms q from the
    residual x for the given nu, synchronous. */
 kernel_updateweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x, q, robust_nu);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t err = cudaGetLastError();
 if (err != cudaSuccess) {
   /* print the CUDA error message and exit */
   fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
   exit(-1);
 }
#endif
}
/* update weights by a cuda kernel */
void
cudakernel_sqrtweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt) {
 /* Host wrapper: wt <- sqrt(wt) elementwise on the device, synchronous. */
 kernel_sqrtweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t err = cudaGetLastError();
 if (err != cudaSuccess) {
   /* print the CUDA error message and exit */
   fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
   exit(-1);
 }
#endif
}
/* evaluate expression for finding optimum nu for
a range of nu values */
void
cudakernel_evaluatenu(int ThreadsPerBlock, int BlocksPerGrid, int Nd, double qsum, double *q, double deltanu,double nulow) {
 /* Host wrapper: evaluate the nu-estimation objective over Nd grid points
    starting at nulow with spacing deltanu, results in q. Synchronous. */
 kernel_evaluatenu<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu, nulow);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t err = cudaGetLastError();
 if (err != cudaSuccess) {
   /* print the CUDA error message and exit */
   fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
   exit(-1);
 }
#endif
}
/* cuda driver for calculating wt \odot f() */
/* p: params (Mx1), x: data (Nx1), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
void
cudakernel_func_wt(int ThreadsPerBlock, int BlocksPerGrid, double *p, double *x, int M, int N, double *coh, short *bbh, double *wt, int Nbase, int Mclus, int Nstations) {
 /* Host wrapper: evaluate the weighted model wt .* f(p) into x.
    x is zeroed first because flagged baselines are never written by the
    kernel. p: params (Mx1), x: data (Nx1). Synchronous. */
 cudaMemset(x, 0, N*sizeof(double));
 kernel_func_wt<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase, x, coh, p, bbh, wt, Nstations);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t err = cudaGetLastError();
 if (err != cudaSuccess) {
   /* print the CUDA error message and exit */
   fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
   exit(-1);
 }
#endif
}
/* cuda driver for calculating wt \odot jacf() */
/* p: params (Mx1), jac: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
void
cudakernel_jacf_wt(int ThreadsPerBlock_row, int ThreadsPerBlock_col, double *p, double *jac, int M, int N, double *coh, short *bbh, double *wt, int Nbase, int Mclus, int Nstations, int clus) {
/* Host wrapper: compute the weighted Jacobian (N rows x M params) into jac.
   NOTE(review): ThreadsPerBlock_row/ThreadsPerBlock_col and clus are
   accepted but never used — the block shape is hard-coded to 16x8 below;
   confirm whether callers rely on their values. */
#ifdef CUDA_DBG
  cudaError_t error;
#endif
  /* NOTE: use small value for ThreadsPerBlock here, like 8 */
  dim3 threadsPerBlock(16, 8);
  /* jacobian: Nbase x Nstations (proportional to N), so */
  /* 2-D grid: x covers baselines, y covers parameters */
  dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x,
               (M+threadsPerBlock.y-1)/threadsPerBlock.y);
  /* set memory of jac to zero: entries skipped by the kernel (flagged
     baselines, unrelated parameters) must read as 0 */
  cudaMemset(jac, 0, N*M*sizeof(double));
 // printf("Kernel Jax data size=%d, params=%d, block=%d,%d, thread=%d,%d, baselines=%d\n",N, M, numBlocks.x,numBlocks.y, threadsPerBlock.x, threadsPerBlock.y, Nbase);
  kernel_jacf_wt<<< numBlocks, threadsPerBlock>>>(Nbase, M, jac, coh, p, bbh, wt, Nstations);
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif
}
/* cuda driver for kernel */
/* ThreadsPerBlock: keep <= 128 ???
BlocksPerGrid: depends on the threads/baselines> Threads*Blocks approx baselines
Nbase: no of baselines (total, including tilesz >1)
tilesz: tile size
M: no of clusters
Ns: no of stations
Nparam: no of actual parameters <=total
goff: starting point of gradient calculation 0..Nparams
x: N*8 x 1 residual
coh: N*8*M x 1
p: M*Ns*8 x 1
bb: 2*N x 1
ptoclus: 2*M x 1
grad: Nparamsx1 gradient values
*/
void cudakernel_lbfgs_robust(int ThreadsPerBlock, int BlocksPerGrid, int Nbase, int tilesz, int M, int Ns, int Nparam, int goff, double robust_nu, double *x, double *coh, double *p, short *bb, int *ptoclus, double *grad){
/* Host wrapper: robust LBFGS gradient — one thread per parameter in the
   window [goff, goff+Nparam). Synchronous. */
#ifdef CUDA_DBG
  cudaError_t error;
#endif
  /* invoke device on this block/thread grid (last argument is buffer size in bytes) */
  /* NOTE(review): the dynamic shared-memory allocation below does not appear
     to be referenced by kernel_deriv_robust — possibly a leftover; confirm
     before removing. */
  kernel_deriv_robust<<< BlocksPerGrid, ThreadsPerBlock, ThreadsPerBlock*sizeof(double) >>> (Nbase, tilesz, M, Ns, Nparam, goff, robust_nu, x, coh, p, bb, ptoclus, grad);
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif
}
}

Binary file not shown.

View File

@ -1,536 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "cuda.h"
#include <cuComplex.h>
#include <stdio.h>
/* enable this for checking for kernel failure */
//#define CUDA_DBG
__global__ void
kernel_func_wt_fl(int Nbase, float *x, float *coh, float *p, short *bb, float *wt, int N){
/* Single-precision variant of kernel_func_wt: per baseline n compute
   G1*C*G2^H (2x2 complex) and write the 8 weighted real components into
   x[8n..8n+7]. Flagged baselines (sta1==sta2==-1) are left untouched. */
/* global thread index : equal to the baseline */
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
/* this thread works on
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
bb[2*n:2*n+1] (sta1,sta2)
organization of p (N stations and M clusters)
sta 0 sta 1 sta 2 .... sta N-1
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
......
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
organization of coherencies (coh)
[0, 8*M-1] : baseline 0
[8*M, 8*M+8*M-1]: baseline 1
[n*8*M, n*8*M+8*M-1]: baseline n
......
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
residual error stored at sum[n]
*/
if(n<Nbase) {
int sta1=(int)bb[2*n];
int sta2=(int)bb[2*n+1];
/* condition for calculating this baseline sum is
1) its not flagged (sta1,sta2)>=0
*/
if (sta1>=0 && sta2>=0) {
/* G1: 2x2 Jones matrix of station sta1, row-major [00,01,10,11] */
cuComplex G1[4];
float pp[8];
pp[0]=p[sta1*8];
pp[1]=p[sta1*8+1];
pp[2]=p[sta1*8+2];
pp[3]=p[sta1*8+3];
pp[4]=p[sta1*8+4];
pp[5]=p[sta1*8+5];
pp[6]=p[sta1*8+6];
pp[7]=p[sta1*8+7];
G1[0].x=pp[0];
G1[0].y=pp[1];
G1[1].x=pp[2];
G1[1].y=pp[3];
G1[2].x=pp[4];
G1[2].y=pp[5];
G1[3].x=pp[6];
G1[3].y=pp[7];
/* C: source coherency matrix for this baseline */
cuComplex C[4];
C[0].x=coh[8*n];
C[0].y=coh[8*n+1];
C[1].x=coh[8*n+2];
C[1].y=coh[8*n+3];
C[2].x=coh[8*n+4];
C[2].y=coh[8*n+5];
C[3].x=coh[8*n+6];
C[3].y=coh[8*n+7];
cuComplex T1[4];
/* T=G1*C */
T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
cuComplex G2[4];
/* conjugate this */
/* G2 = (station sta2 Jones)^H: imaginary parts negated here, the
   transpose is done by the swapped G2[1]/G2[2] assignments below */
pp[0]=p[sta2*8];
pp[1]=-p[sta2*8+1];
pp[2]=p[sta2*8+2];
pp[3]=-p[sta2*8+3];
pp[4]=p[sta2*8+4];
pp[5]=-p[sta2*8+5];
pp[6]=p[sta2*8+6];
pp[7]=-p[sta2*8+7];
G2[0].x=pp[0];
G2[0].y=pp[1];
G2[2].x=pp[2];
G2[2].y=pp[3];
G2[1].x=pp[4];
G2[1].y=pp[5];
G2[3].x=pp[6];
G2[3].y=pp[7];
cuComplex T2[4];
T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
/* update model vector, with weights */
x[8*n]=wt[8*n]*T2[0].x;
x[8*n+1]=wt[8*n+1]*T2[0].y;
x[8*n+2]=wt[8*n+2]*T2[1].x;
x[8*n+3]=wt[8*n+3]*T2[1].y;
x[8*n+4]=wt[8*n+4]*T2[2].x;
x[8*n+5]=wt[8*n+5]*T2[2].y;
x[8*n+6]=wt[8*n+6]*T2[3].x;
x[8*n+7]=wt[8*n+7]*T2[3].y;
}
}
}
__global__ void
kernel_jacf_wt_fl(int Nbase, int M, float *jac, float *coh, float *p, short *bb, float *wt, int N){
/* Single-precision variant of kernel_jacf_wt: thread (n,m) fills column m
   of the 8 Jacobian rows of baseline n, using a canonical basis matrix in
   place of the Jones matrix of the parameter's station. */
/* global thread index : equal to the baseline */
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
/* which parameter:0...M */
unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
/* this thread works on
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
bb[2*n:2*n+1] (sta1,sta2)
organization of p (N stations and M clusters)
sta 0 sta 1 sta 2 .... sta N-1
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
......
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
organization of coherencies (coh)
[0, 8*M-1] : baseline 0
[8*M, 8*M+8*M-1]: baseline 1
[n*8*M, n*8*M+8*M-1]: baseline n
......
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
residual error stored at sum[n]
*/
if(n<Nbase && m<M) {
int sta1=(int)bb[2*n];
int sta2=(int)bb[2*n+1];
/* condition for calculating this baseline sum is
If this baseline is flagged,
or if this parameter does not belong to sta1 or sta2
we do not compute
*/
//int stc=m/8; /* 0...Ns-1 (because M=total par= 8 * Nstations */
int stc=m>>3; /* 0...Ns-1 (because M=total par= 8 * Nstations */
if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {
cuComplex C[4];
C[0].x=coh[8*n];
C[0].y=coh[8*n+1];
C[1].x=coh[8*n+2];
C[1].y=coh[8*n+3];
C[2].x=coh[8*n+4];
C[2].y=coh[8*n+5];
C[3].x=coh[8*n+6];
C[3].y=coh[8*n+7];
/* which parameter exactly 0..7 */
//int stoff=m%8;
int stoff=m-stc*8;
/* exactly one of the two branches below runs (guard guarantees
   stc==sta1 or stc==sta2), so pp1/pp2 are always fully initialized */
float pp1[8];
float pp2[8];
if (stc==sta1) {
for (int cn=0; cn<8; cn++) {
pp1[cn]=0.0f;
pp2[cn]=p[sta2*8+cn];
}
pp1[stoff]=1.0f;
} else if (stc==sta2) {
for (int cn=0; cn<8; cn++) {
pp2[cn]=0.0f;
pp1[cn]=p[sta1*8+cn];
}
pp2[stoff]=1.0f;
}
cuComplex G1[4];
G1[0].x=pp1[0];
G1[0].y=pp1[1];
G1[1].x=pp1[2];
G1[1].y=pp1[3];
G1[2].x=pp1[4];
G1[2].y=pp1[5];
G1[3].x=pp1[6];
G1[3].y=pp1[7];
cuComplex T1[4];
/* T=G1*C */
T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
cuComplex G2[4];
/* conjugate this */
/* conjugate-transpose: negated imaginary parts plus swapped
   G2[1]/G2[2] assignments */
G2[0].x=pp2[0];
G2[0].y=-pp2[1];
G2[2].x=pp2[2];
G2[2].y=-pp2[3];
G2[1].x=pp2[4];
G2[1].y=-pp2[5];
G2[3].x=pp2[6];
G2[3].y=-pp2[7];
cuComplex T2[4];
T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
/* update jacobian , with row weights */
/* NOTE: row major order */
jac[m+M*8*n]=wt[8*n]*T2[0].x;
jac[m+M*(8*n+1)]=wt[8*n+1]*T2[0].y;
jac[m+M*(8*n+2)]=wt[8*n+2]*T2[1].x;
jac[m+M*(8*n+3)]=wt[8*n+3]*T2[1].y;
jac[m+M*(8*n+4)]=wt[8*n+4]*T2[2].x;
jac[m+M*(8*n+5)]=wt[8*n+5]*T2[2].y;
jac[m+M*(8*n+6)]=wt[8*n+6]*T2[3].x;
jac[m+M*(8*n+7)]=wt[8*n+7]*T2[3].y;
}
}
}
__global__ void
kernel_setweights_fl(int N, float *wt, float alpha){
  /* Fill wt[0..N-1] with the constant alpha; one element per thread. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= (unsigned int)N) { return; } /* grid may overshoot N */
  wt[idx] = alpha;
}
__global__ void
kernel_hadamard_fl(int N, float *wt, float *x){
  /* Elementwise (Hadamard) product in place: x[i] <- x[i]*wt[i]. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= (unsigned int)N) { return; }
  x[idx] = x[idx]*wt[idx];
}
__global__ void
kernel_updateweights_fl(int N, float *wt, float *x, float *q, float nu){
  /* Robust (Student's t) weight update: wt[i] = (nu+1)/(nu + x[i]^2),
     and q[i] = wt[i] - logf(wt[i]), which is always positive. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx < (unsigned int)N) {
    float w = ((nu+1.0f)/(nu+x[idx]*x[idx]));
    wt[idx] = w;
    q[idx] = w - logf(w); /* so that its +ve */
  }
}
__global__ void
kernel_sqrtweights_fl(int N, float *wt){
  /* Replace each weight by its square root, in place. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= (unsigned int)N) { return; }
  wt[idx] = sqrtf(wt[idx]);
}
__device__ float
digamma_fl(float x) {
  /* Digamma psi(x), single precision: push the argument up to x >= 7 via
     psi(x) = psi(x+1) - 1/x, then use an asymptotic series about x-1/2.
     (Arithmetic kept in the exact order of the original implementation.) */
  float acc = 0.0f;
  while (x < 7.0f) {         /* reduce x till x<7 */
    acc -= 1.0f/x;
    x += 1.0f;
  }
  x -= 1.0f/2.0f;
  float ix  = 1.0f/x;
  float ix2 = ix*ix;
  float ix4 = ix2*ix2;
  acc += logf(x)+(1.0f/24.0f)*ix2-(7.0f/960.0f)*ix4+(31.0f/8064.0f)*ix4*ix2-(127.0f/30720.0f)*ix4*ix4;
  return acc;
}
__global__ void
kernel_evaluatenu_fl(int Nd, float qsum, float *q, float deltanu,float nulow) {
  /* Evaluate the nu-estimation objective on a uniform grid of candidate nu
     values: q[i] = psi((nu+1)/2) - log((nu+1)/2) - psi(nu/2) + log(nu/2)
                    - qsum + 1,  with nu = nulow + i*deltanu. */
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx < (unsigned int)Nd) {
    float nuval = (nulow+((float)idx)*deltanu);
    float dg = digamma_fl(nuval*0.5f+0.5f);
    q[idx] = dg-logf((nuval+1.0f)*0.5f);  /* psi((nu+1)/2)-log((nu+1)/2) */
    dg = digamma_fl(nuval*0.5f);
    q[idx] += -dg+logf((nuval)*0.5f);     /* -psi(nu/2)+log(nu/2) */
    q[idx] += -qsum+1.0f;                 /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
  }
}
__global__ void
kernel_evaluatenu_fl_eight(int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
  /* Evaluate the nu-estimation objective (p=2 case) on a uniform grid:
     q[i] = psi((nu0+2)/2) - log((nu0+2)/2) - psi(nu/2) + log(nu/2)
            - qsum + 1,  with nu = nulow + i*deltanu.
     The nu0-dependent term is identical for all threads, so it is computed
     once per block by thread 0. */
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  /* BUG FIX: dgm0 was a per-thread register written only by threadIdx.x==0,
     so after the barrier every other thread of the block read an
     uninitialized value. It must live in shared memory to be visible
     block-wide. */
  __shared__ float dgm0;
  if (threadIdx.x==0) {
    float d=digamma_fl(nu0*0.5f+1.0f);
    dgm0=d-logf((nu0+2.0f)*0.5f); /* psi((nu0+2)/2)-log((nu0+2)/2) */
  }
  __syncthreads();
  if (tid<Nd) {
    float thisnu=(nulow+((float)tid)*deltanu);
    q[tid]=dgm0; /* psi((nu0+2)/2)-log((nu0+2)/2) */
    float dgm=digamma_fl(thisnu*0.5f);
    q[tid]+=-dgm+logf((thisnu)*0.5f); /* -psi((nu)/2)+log((nu)/2) */
    q[tid]+=-qsum+1.0f; /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
  }
}
/* only use extern if calling code is C */
extern "C"
{
/* set initial weights to 1 by a cuda kernel */
/* Host driver: set all N robust weights on the device to alpha. */
void
cudakernel_setweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float alpha) {
 kernel_setweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, alpha);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t cuda_status=cudaGetLastError();
 if (cuda_status!=cudaSuccess) {
  /* report the failing launch location and abort */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(cuda_status),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* hadamard product by a cuda kernel x<= x*wt */
/* Host driver: elementwise product on the device, x <= x .* wt. */
void
cudakernel_hadamard_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float *x) {
 kernel_hadamard_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t cuda_status=cudaGetLastError();
 if (cuda_status!=cudaSuccess) {
  /* report the failing launch location and abort */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(cuda_status),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* update weights by a cuda kernel */
/* Host driver: recompute robust weights wt (and the q terms used for
   estimating nu) from the residuals x, for degrees of freedom robust_nu. */
void
cudakernel_updateweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float *x, float *q, float robust_nu) {
 kernel_updateweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x, q, robust_nu);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t cuda_status=cudaGetLastError();
 if (cuda_status!=cudaSuccess) {
  /* report the failing launch location and abort */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(cuda_status),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* update weights by a cuda kernel */
/* Host driver: take the square root of each weight in place. */
void
cudakernel_sqrtweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt) {
 kernel_sqrtweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t cuda_status=cudaGetLastError();
 if (cuda_status!=cudaSuccess) {
  /* report the failing launch location and abort */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(cuda_status),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* evaluate expression for finding optimum nu for
a range of nu values */
/* Host driver: evaluate the nu-estimation objective for Nd candidate
   values nu = nulow + i*deltanu; results written into q. */
void
cudakernel_evaluatenu_fl(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow) {
 kernel_evaluatenu_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu,nulow);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t cuda_status=cudaGetLastError();
 if (cuda_status!=cudaSuccess) {
  /* report the failing launch location and abort */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(cuda_status),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* evaluate expression for finding optimum nu for
a range of nu values, using AECM (p=8 before, but now p=2)
nu0: current value of robust_nu*/
/* Host driver: AECM variant of the nu grid evaluation (p=2, was p=8);
   nu0 is the current value of robust_nu. */
void
cudakernel_evaluatenu_fl_eight(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
 kernel_evaluatenu_fl_eight<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu,nulow, nu0);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t cuda_status=cudaGetLastError();
 if (cuda_status!=cudaSuccess) {
  /* report the failing launch location and abort */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(cuda_status),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* cuda driver for calculating wt \odot f() */
/* p: params (Mx1), x: data (Nx1), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
/* Host driver for the weighted model evaluation wt .* f():
   p: params (Mx1), x: output data (Nx1, zeroed here before accumulation),
   coh: coherencies, bbh: baseline->station map, Nbase baselines. */
void
cudakernel_func_wt_fl(int ThreadsPerBlock, int BlocksPerGrid, float *p, float *x, int M, int N, float *coh, short *bbh, float *wt, int Nbase, int Mclus, int Nstations) {
 /* clear the output before the kernel accumulates into it */
 cudaMemset(x, 0, N*sizeof(float));
 kernel_func_wt_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase, x, coh, p, bbh, wt, Nstations);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t cuda_status=cudaGetLastError();
 if (cuda_status!=cudaSuccess) {
  /* report the failing launch location and abort */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(cuda_status),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
/* cuda driver for calculating wt \odot jacf() */
/* p: params (Mx1), jac: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
/* Host driver for the weighted Jacobian wt .* jacf():
   p: params (Mx1), jac: Jacobian (NxM, zeroed here), coh: coherencies,
   bbh: baseline->station map.
   NOTE(review): ThreadsPerBlock_row/ThreadsPerBlock_col are not used;
   the 2-D launch geometry is fixed at 16x8 below -- confirm intended. */
void
cudakernel_jacf_wt_fl(int ThreadsPerBlock_row, int ThreadsPerBlock_col, float *p, float *jac, int M, int N, float *coh, short *bbh, float *wt, int Nbase, int Mclus, int Nstations, int clus) {
 /* NOTE: use small value for ThreadsPerBlock here, like 8 */
 dim3 launch_threads(16, 8);
 /* Jacobian has Nbase rows (proportional to N) and M columns */
 dim3 launch_blocks((Nbase+launch_threads.x-1)/launch_threads.x,
            (M+launch_threads.y-1)/launch_threads.y);
 /* clear the Jacobian before the kernel accumulates into it */
 cudaMemset(jac, 0, N*M*sizeof(float));
 kernel_jacf_wt_fl<<< launch_blocks, launch_threads>>>(Nbase, M, jac, coh, p, bbh, wt, Nstations);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t cuda_status=cudaGetLastError();
 if (cuda_status!=cudaSuccess) {
  /* report the failing launch location and abort */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(cuda_status),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,894 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <float.h>
#include "Solvers.h"
#include <cuda_runtime.h>
//#define DEBUG
/* helper functions for diagnostics */
/* Diagnostic helper: if err indicates a CUDA failure, print the error
   string with the source location and abort. Compiled to a no-op unless
   CUDA_DEBUG is defined.
   FIX: file is now const char * -- callers pass the __FILE__ string
   literal, and string-literal to char* conversion is ill-formed in
   modern C++ (nvcc compiles this file as C++). */
static void
checkCudaError(cudaError_t err, const char *file, int line)
{
#ifdef CUDA_DEBUG
 if(!err)
  return;
 fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
 exit(EXIT_FAILURE);
#else
 /* silence unused-parameter warnings in non-debug builds */
 (void)err; (void)file; (void)line;
#endif
}
/* Diagnostic helper: if cbstatus indicates a cuBLAS failure, print the
   source location and abort. Compiled to a no-op unless CUDA_DEBUG is
   defined.
   FIX: file is now const char * -- callers pass the __FILE__ string
   literal, and string-literal to char* conversion is ill-formed in
   modern C++ (nvcc compiles this file as C++). */
static void
checkCublasError(cublasStatus_t cbstatus, const char *file, int line)
{
#ifdef CUDA_DEBUG
 if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
  fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
  exit(EXIT_FAILURE);
 }
#else
 /* silence unused-parameter warnings in non-debug builds */
 (void)cbstatus; (void)file; (void)line;
#endif
}
/* Retraction
rnew: new value */
/* rnew = x + r */
/* Retraction onto the manifold: rnew = x + r (Euclidean update of the
   4N-vector, done with cuBLAS copy + axpy on the device). */
void
cudakernel_fns_R(int N, cuFloatComplex *x, cuFloatComplex *r, cuFloatComplex *rnew, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
 cublasStatus_t cbstatus;
 cuFloatComplex one;
 one.x=1.0f; one.y=0.0f;
 /* rnew <= x */
 cbstatus=cublasCcopy(cbhandle,4*N,x,1,rnew,1);
 /* rnew <= rnew + 1*r */
 cbstatus=cublasCaxpy(cbhandle,4*N, &one, r, 1, rnew, 1);
 checkCublasError(cbstatus,__FILE__,__LINE__);
}
/* inner product (metric) */
/* Riemannian metric (inner product):
   2 * real( trace(eta' * gamma) )
   = 2 * real( eta(:,1)'*gamma(:,1) + eta(:,2)'*gamma(:,2) ),
   so only the two diagonal column dot products are needed. */
float
cudakernel_fns_g(int N,cuFloatComplex *x,cuFloatComplex *eta, cuFloatComplex *gamma,cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
 cublasStatus_t cbstatus;
 cuFloatComplex d0,d1;
 /* first column: eta(:,1)' * gamma(:,1) */
 cbstatus=cublasCdotc(cbhandle,2*N,eta,1,gamma,1,&d0);
 /* second column: eta(:,2)' * gamma(:,2) */
 cbstatus=cublasCdotc(cbhandle,2*N,&eta[2*N],1,&gamma[2*N],1,&d1);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 return 2.0f*(d0.x+d1.x);
}
/* Projection
rnew: new value */
/* Project the ambient direction z onto the tangent space at x.
   Computes rnew = Z - X*Om, where the 2x2 matrix Om solves the small
   Lyapunov-type system Om (X^H X) + (X^H X) Om = X^H Z - Z^H X.
   The 4x4 complex linear system for vec(Om) is solved on the GPU with
   cuSOLVER QR (geqrf + unmqr) followed by a cuBLAS triangular solve.
   Scratch device buffers (Ad, bd, work, taud, devInfo) are allocated and
   freed on every call. */
void
cudakernel_fns_proj(int N, cuFloatComplex *x, cuFloatComplex *z, cuFloatComplex *rnew, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
 /* projection = Z-X Om, where
   Om X^H X+X^H X Om = X^H Z - Z^H X
  is solved to find Om */
 cublasStatus_t cbstatus;
 /* find X^H X */
 cuFloatComplex xx00,xx01,xx10,xx11,*bd;
 //xx00=my_cdot(2*N,x,x);
 cbstatus=cublasCdotc(cbhandle,2*N,x,1,x,1,&xx00);
 //xx01=my_cdot(2*N,x,&x[2*N]);
 cbstatus=cublasCdotc(cbhandle,2*N,x,1,&x[2*N],1,&xx01);
 /* X^H X is Hermitian, so the (1,0) entry is the conjugate of (0,1) */
 xx10=cuConjf(xx01);
 //xx11=my_cdot(2*N,&x[2*N],&x[2*N]);
 cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,&x[2*N],1,&xx11);
 /* find X^H Z (and using this just calculte Z^H X directly) */
 cuFloatComplex xz00,xz01,xz10,xz11;
 //xz00=my_cdot(2*N,x,z);
 cbstatus=cublasCdotc(cbhandle,2*N,x,1,z,1,&xz00);
 //xz01=my_cdot(2*N,x,&z[2*N]);
 cbstatus=cublasCdotc(cbhandle,2*N,x,1,&z[2*N],1,&xz01);
 //xz10=my_cdot(2*N,&x[2*N],z);
 cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,z,1,&xz10);
 //xz11=my_cdot(2*N,&x[2*N],&z[2*N]);
 cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,&z[2*N],1,&xz11);
 /* find X^H Z - Z^H X (skew-Hermitian right hand side) */
 cuFloatComplex rr00,rr01,rr10,rr11;
 //rr00=xz00-conj(xz00);
 rr00=cuCsubf(xz00,cuConjf(xz00));
 //rr01=xz01-conj(xz10);
 rr01=cuCsubf(xz01,cuConjf(xz10));
 //rr10=-conj(rr01);
 rr10.x=-rr01.x; rr10.y=rr01.y;
 //rr11=xz11-conj(xz11);
 rr11=cuCsubf(xz11,cuConjf(xz11));
 /* find I_2 kron (X^H X) + (X^H X)^T kron I_2 */
 /* A = [2*xx00  xx01       xx10       0
     xx10  xx11+xx00  0      xx10
     xx01  0       xx11+xx00  xx01
     0    xx01      xx10     2*xx11 ]
 */
 /* A is stored in column-major order, as cuBLAS/cuSOLVER expect */
 cuFloatComplex A[16],*Ad;
 A[0]=cuCmulf(make_cuFloatComplex(2.0f,0.0f),xx00);
 A[5]=A[10]=cuCaddf(xx00,xx11);
 A[15]=cuCmulf(make_cuFloatComplex(2.0f,0.0f),xx11);
 A[1]=A[8]=A[11]=A[13]=xx10;
 A[2]=A[4]=A[7]=A[14]=xx01;
 A[3]=A[6]=A[9]=A[12]=make_cuFloatComplex(0.0f,0.0f);
 /* right hand side b = vec([rr00 rr01; rr10 rr11]) */
 cuFloatComplex b[4];
 b[0]=rr00;
 b[1]=rr10;
 b[2]=rr01;
 b[3]=rr11;
#ifdef DEBUG
 printf("BEFOREA=[\n");
 printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[0].x,A[0].y,A[4].x,A[4].y,A[8].x,A[8].y,A[12].x,A[12].y);
 printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[1].x,A[1].y,A[5].x,A[5].y,A[9].x,A[9].y,A[13].x,A[13].y);
 printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[2].x,A[2].y,A[6].x,A[6].y,A[10].x,A[10].y,A[14].x,A[14].y);
 printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[3].x,A[3].y,A[7].x,A[7].y,A[11].x,A[11].y,A[15].x,A[15].y);
 printf("];\n");
 printf("BEFOREb=[\n");
 printf("%f+j*(%f)\n",b[0].x,b[0].y);
 printf("%f+j*(%f)\n",b[1].x,b[1].y);
 printf("%f+j*(%f)\n",b[2].x,b[2].y);
 printf("%f+j*(%f)\n",b[3].x,b[3].y);
 printf("];\n");
#endif
 /* solve A u = b to find u , using double precision */
 /* copy A and b to the device for the cuSOLVER/cuBLAS solve */
 cudaMalloc((void **)&Ad, 16*sizeof(cuFloatComplex));
 cudaMemcpy(Ad,A,16*sizeof(cuFloatComplex),cudaMemcpyHostToDevice);
 /* copy b to device */
 cudaMalloc((void **)&bd, 4*sizeof(cuFloatComplex));
 cudaMemcpy(bd,b,4*sizeof(cuFloatComplex),cudaMemcpyHostToDevice);
 //culaStatus status;
 //status=culaDeviceCgels('N',4,4,1,(culaDeviceFloatComplex *)Ad,4,(culaDeviceFloatComplex *)bd,4);
 //checkStatus(status,__FILE__,__LINE__);
 int work_size=0;
 int *devInfo;
 cudaError_t err;
 err=cudaMalloc((void**)&devInfo, sizeof(int));
 checkCudaError(err,__FILE__,__LINE__);
 cuFloatComplex *work,*taud;
 /* query and allocate workspace for the QR factorization */
 cusolverDnCgeqrf_bufferSize(solver_handle, 4, 4, (cuFloatComplex *)Ad, 4, &work_size);
 err=cudaMalloc((void**)&work, work_size*sizeof(cuFloatComplex));
 err=cudaMalloc((void**)&taud, 4*sizeof(cuFloatComplex));
 checkCudaError(err,__FILE__,__LINE__);
 /* A = Q R; then b <= Q^H b; then back-substitute with triangular R */
 cusolverDnCgeqrf(solver_handle, 4, 4, Ad, 4, taud, work, work_size, devInfo);
 cusolverDnCunmqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_C, 4, 1, 4, Ad, 4, taud, bd, 4, work, work_size, devInfo);
 cuFloatComplex cone; cone.x=1.0f; cone.y=0.0f;
 cbstatus=cublasCtrsm(cbhandle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,4,1,&cone,Ad,4,bd,4);
 cudaFree(work);
 cudaFree(taud);
 cudaFree(devInfo);
#ifdef DEBUG
 cudaMemcpy(b,bd,4*sizeof(cuFloatComplex),cudaMemcpyDeviceToHost);
 printf("Afterb=[\n");
 printf("%f+j*(%f)\n",b[0].x,b[0].y);
 printf("%f+j*(%f)\n",b[1].x,b[1].y);
 printf("%f+j*(%f)\n",b[2].x,b[2].y);
 printf("%f+j*(%f)\n",b[3].x,b[3].y);
 printf("];\n");
#endif
 /* form Z - X * Om, where Om is given by solution b
   but no need to rearrange b because it is already in col major order */
 //my_ccopy(4*N,z,1,rnew,1);
 cbstatus=cublasCcopy(cbhandle,4*N,z,1,rnew,1);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 //my_zgemm('N','N',2*N,2,2,-1.0+0.0*_Complex_I,z,2*N,b,2,1.0+0.0*_Complex_I,rnew,2*N);
 cuFloatComplex a1,a2;
 a1.x=-1.0f; a1.y=0.0f;
 a2.x=1.0f; a2.y=0.0f;
#ifdef DEBUG
 /* read back eta for checking */
 cuFloatComplex *etalocal;
 cudaHostAlloc((void **)&etalocal, sizeof(cuFloatComplex)*4*N,cudaHostAllocDefault);
 cudaMemcpy(etalocal, rnew, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
 printf("Rnewbefore=[\n");
 int ci;
 for (ci=0; ci<2*N; ci++) {
  printf("%f+j*(%f) %f+j*(%f);\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
 }
 printf("]\n");
#endif
 /* rnew <= rnew - X * Om  (GEMM with alpha=-1, beta=1) */
 cbstatus=cublasCgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_N,2*N,2,2,&a1,x,2*N,bd,2,&a2,rnew,2*N);
#ifdef DEBUG
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaMemcpy(etalocal, rnew, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
 printf("Rnewafter=[\n");
 for (ci=0; ci<2*N; ci++) {
  printf("%f+j*(%f) %f+j*(%f);\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
 }
 printf("]\n");
 cudaFreeHost(etalocal);
#endif
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaFree(Ad);
 cudaFree(bd);
}
/* gradient, also projected to tangent space */
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
/* Compute the cost gradient at x and project it onto the tangent space,
   writing the result into eta. When M is large the baselines are
   processed in chunks of T=DEFAULT_TH_PER_BK*ThreadsPerBlock and the
   per-chunk partial gradients are accumulated with axpy. The gradient is
   scaled by the per-station inverse baseline counts iw; if negate!=0 the
   negative gradient (a descent direction) is returned instead.
   Temporary device storage: 2 buffers of 4N cuFloatComplex per call. */
static void
cudakernel_fns_fgrad(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, float *y, float *coh, short *bbh, float *iw, int negate, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
 cuFloatComplex *tempeta,*tempb;
 cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
 cuFloatComplex alpha;
 cudaMalloc((void**)&tempeta, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&tempb, sizeof(cuFloatComplex)*4*N);
 /* max size of M for one kernel call, to determine optimal blocks */
 int T=DEFAULT_TH_PER_BK*ThreadsPerBlock;
 if (M<T) {
  /* small problem: one kernel call covers all M baselines */
  cudakernel_fns_fgradflat(ThreadsPerBlock, BlocksPerGrid, N, M, x, tempeta, y, coh, bbh);
 } else {
  /* reset memory to zero */
  cudaMemset(tempeta, 0, sizeof(cuFloatComplex)*4*N);
  /* loop through M baselines */
  int L=(M+T-1)/T;
  int ct=0;
  int myT,ci;
  for (ci=0; ci<L; ci++) {
   /* last chunk may be shorter than T */
   if (ct+T<M) {
    myT=T;
   } else {
    myT=M-ct;
   }
   int B=(myT+ThreadsPerBlock-1)/ThreadsPerBlock;
   /* partial gradient for baselines [ct, ct+myT); note the data strides:
     8 floats per baseline in y/coh, 2 shorts per baseline in bbh */
   cudakernel_fns_fgradflat(ThreadsPerBlock, B, N, myT, x, tempb, &y[ct*8], &coh[ct*8], &bbh[ct*2]);
   alpha.x=1.0f;alpha.y=0.0f;
   /* tempeta <= tempeta + tempb */
   cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, tempb, 1, tempeta, 1);
   ct=ct+T;
  }
 }
 /* scale each station's rows by its inverse baseline count */
 cudakernel_fns_fscale(N, tempeta, iw);
 /* find -ve gradient */
 if (negate) {
  alpha.x=-1.0f;alpha.y=0.0f;
  cbstatus=cublasCscal(cbhandle,4*N,&alpha,tempeta,1);
 }
 /* project the (possibly negated) gradient onto the tangent space at x */
 cudakernel_fns_proj(N, x, tempeta, eta, cbhandle, solver_handle);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaFree(tempeta);
 cudaFree(tempb);
}
/* Hessian, also projected to tangent space */
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
/* Apply the Hessian of the cost at x to direction eta, project the
   result onto the tangent space, and write it into fhess. Mirrors
   cudakernel_fns_fgrad: chunked over baselines when M is large, partial
   results accumulated with axpy, then scaled by iw and projected.
   Temporary device storage: 2 buffers of 4N cuFloatComplex per call. */
static void
cudakernel_fns_fhess(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, short *bbh, float *iw, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
 cuFloatComplex *tempeta,*tempb;
 cudaMalloc((void**)&tempeta, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&tempb, sizeof(cuFloatComplex)*4*N);
 cuFloatComplex alpha;
 cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
 /* max size of M for one kernel call, to determine optimal blocks */
 int T=DEFAULT_TH_PER_BK*ThreadsPerBlock;
 if (M<T) {
  /* small problem: one kernel call covers all M baselines */
  cudakernel_fns_fhessflat(ThreadsPerBlock, BlocksPerGrid, N, M, x, eta, tempeta, y, coh, bbh);
 } else {
  /* reset memory to zero */
  cudaMemset(tempeta, 0, sizeof(cuFloatComplex)*4*N);
  /* loop through M baselines */
  int L=(M+T-1)/T;
  int ct=0;
  int myT,ci;
  for (ci=0; ci<L; ci++) {
   /* last chunk may be shorter than T */
   if (ct+T<M) {
    myT=T;
   } else {
    myT=M-ct;
   }
   int B=(myT+ThreadsPerBlock-1)/ThreadsPerBlock;
   /* partial Hessian-vector product for baselines [ct, ct+myT) */
   cudakernel_fns_fhessflat(ThreadsPerBlock, B, N, myT, x, eta, tempb, &y[ct*8], &coh[ct*8], &bbh[ct*2]);
   alpha.x=1.0f;alpha.y=0.0f;
   /* tempeta <= tempeta + tempb */
   cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, tempb, 1, tempeta, 1);
   ct=ct+T;
  }
 }
 /* scale each station's rows by its inverse baseline count */
 cudakernel_fns_fscale(N, tempeta, iw);
 /* project onto the tangent space at x */
 cudakernel_fns_proj(N, x, tempeta, fhess, cbhandle, solver_handle);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaFree(tempeta);
 cudaFree(tempb);
}
/* Armijo step calculation,
output teta: Armijo gradient
return value: 0 : cost reduced, 1: no cost reduction, so do not run again
mincost: minimum value of cost found, if possible
*/
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
/* Armijo backtracking line search along the (non-negated) gradient at x.
   Tries step sizes beta^k * alphabar (alphabar=10, beta=0.2, up to 50
   backtracks) and accepts when the sufficient-decrease condition
   f(R(x, t*eta)) <= f(x) + sigma * t * <eta,eta> holds (sigma=0.5).
   output teta: the scaled gradient step (best step found)
   mincost: minimum cost value encountered during the search
   return value: 0 : cost reduced, 1: no cost reduction, so do not run again
   Temporary device storage: 2 buffers of 4N cuFloatComplex per call. */
static int
armijostep(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *teta, float *y, float *coh, short *bbh, float *iw, float *mincost, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
 float alphabar=10.0f;
 float beta=0.2f;
 float sigma=0.5f;
 cublasStatus_t cbstatus;
 /* temp storage, re-using global storage */
 cuFloatComplex *eta, *x_prop;
 cudaMalloc((void**)&eta, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&x_prop, sizeof(cuFloatComplex)*4*N);
 /* current cost f(x) */
 //double fx=fns_f(x,y,gdata);
 float fx=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x,y,coh,bbh);
 /* gradient at x (not negated: negate=0) */
 //fns_fgrad(x,eta,y,gdata,0);
 cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,x,eta,y,coh,bbh,iw,0,cbhandle, solver_handle);
#ifdef DEBUG
 float eta_nrm;
 cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
 printf("||eta||=%f\n",eta_nrm);
 /* read back eta for checking */
 cuFloatComplex *etalocal;
 cudaHostAlloc((void **)&etalocal, sizeof(cuFloatComplex)*4*N,cudaHostAllocDefault);
 cudaMemcpy(etalocal, eta, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
 printf("Eta=[\n");
 int ci;
 for (ci=0; ci<2*N; ci++) {
  printf("%f %f %f %f\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
 }
 printf("]\n");
 cudaFreeHost(etalocal);
#endif
 float beta0=beta;
 float minfx=fx; float minbeta=beta0;
 float lhs,rhs,metric;
 int m,nocostred=0;
 cuFloatComplex alpha;
 *mincost=fx;
 /* <eta,eta>, computed once; the per-step metric scales linearly with t */
 float metric0=cudakernel_fns_g(N,x,eta,eta,cbhandle,solver_handle);
 for (m=0; m<50; m++) {
  /* abeta=(beta0)*alphabar*eta; */
  //my_ccopy(4*dp->N,eta,1,teta,1);
  cbstatus=cublasCcopy(cbhandle,4*N,eta,1,teta,1);
  //my_cscal(4*dp->N,beta0*alphabar+0.0*_Complex_I,teta);
  alpha.x=beta0*alphabar;alpha.y=0.0f;
  cbstatus=cublasCscal(cbhandle,4*N,&alpha,teta,1);
  /* Rx=R(x,teta); */
  //fns_R(dp->N,x,teta,x_prop);
  cudakernel_fns_R(N,x,teta,x_prop,cbhandle,solver_handle);
  //lhs=fns_f(x_prop,y,gdata);
  lhs=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,y,coh,bbh);
  /* remember the best (lowest cost) step seen so far */
  if (lhs<minfx) {
   minfx=lhs;
   *mincost=minfx;
   minbeta=beta0;
  }
  //rhs=fx+sigma*fns_g(dp->N,x,eta,teta);
  //metric=cudakernel_fns_g(N,x,eta,teta,cbhandle);
  /* metric = t * <eta,eta>, using the precomputed metric0 */
  metric=beta0*alphabar*metric0;
  rhs=fx+sigma*metric;
#ifdef DEBUG
  printf("m=%d lhs=%e rhs=%e rat=%e norm=%e\n",m,lhs,rhs,lhs/rhs,metric);
#endif
  /* Armijo sufficient-decrease test */
  if ((!isnan(lhs) && lhs<=rhs)) {
   minbeta=beta0;
   break;
  }
  /* shrink the step and retry */
  beta0=beta0*beta;
 }
 /* if no further cost improvement is seen */
 if (lhs>fx) {
  nocostred=1;
 }
 /* teta <= (best step size) * eta */
 //my_ccopy(4*dp->N,eta,1,teta,1);
 cbstatus=cublasCcopy(cbhandle,4*N,eta,1,teta,1);
 alpha.x=minbeta*alphabar; alpha.y=0.0f;
 //my_cscal(4*dp->N,minbeta*alphabar+0.0*_Complex_I,teta);
 cbstatus=cublasCscal(cbhandle,4*N,&alpha,teta,1);
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaFree(eta);
 cudaFree(x_prop);
 return nocostred;
}
/* truncated conjugate gradient method
x, grad, eta, r, z, delta, Hxd : size 2N x 2 complex
so, vector size is 4N complex double
output: eta
return value: stop_tCG code
y: vec(V) visibilities
*/
/* need 8N*(BlocksPerGrid+2)+ 8N*6 float storage */
/* Truncated conjugate gradient (Steihaug-Toint tCG) solver for the
   trust-region subproblem at x, with trust radius Delta and the
   kappa/theta inner stopping rule.
   x, grad, eta, r, z, delta, Hxd : size 2N x 2 complex
   so, vector size is 4N complex
   output: eta (the step), fhess (accumulated Hessian applied to eta)
   return value: stop_tCG code (1: negative curvature, 2: hit trust-region
   boundary, 3/4: kappa/theta residual test, 5: ran out of iterations)
   y: vec(V) visibilities
   Temporary device storage: 5 buffers of 4N cuFloatComplex per call. */
static int
tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *grad, cuFloatComplex *eta, cuFloatComplex *fhess, float Delta, float theta, float kappa, int max_inner, int min_inner, float *y, float *coh, short *bbh, float *iw, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
 cuFloatComplex *r,*z,*delta,*Hxd, *rnew;
 float e_Pe, r_r, norm_r, z_r, d_Pd, d_Hd, alpha, e_Pe_new,
   e_Pd, Deltasq, tau, zold_rold, beta, norm_r0;
 int cj, stop_tCG;
 cudaMalloc((void**)&r, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&z, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&delta, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&Hxd, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&rnew, sizeof(cuFloatComplex)*4*N);
 cublasStatus_t cbstatus;
 cuFloatComplex a0;
 /*
  initial values
 */
 /* r <= grad (initial residual); eta is assumed zeroed by the caller */
 cbstatus=cublasCcopy(cbhandle,4*N,grad,1,r,1);
 e_Pe=0.0f;
 r_r=cudakernel_fns_g(N,x,r,r,cbhandle,solver_handle);
 norm_r=sqrtf(r_r);
 norm_r0=norm_r;
 /* z <= r (no preconditioner) */
 cbstatus=cublasCcopy(cbhandle,4*N,r,1,z,1);
 z_r=cudakernel_fns_g(N,x,z,r,cbhandle,solver_handle);
 d_Pd=z_r;
 /*
  initial search direction: delta = -z
 */
 cudaMemset(delta, 0, sizeof(cuFloatComplex)*4*N);
 a0.x=-1.0f; a0.y=0.0f;
 cbstatus=cublasCaxpy(cbhandle,4*N, &a0, z, 1, delta, 1);
 e_Pd=cudakernel_fns_g(N,x,eta,delta,cbhandle,solver_handle);
 stop_tCG=5;
 /* % begin inner/tCG loop
  for j = 1:max_inner,
 */
 for(cj=1; cj<=max_inner; cj++) {
  /* Hxd <= Hess(x)[delta]; d_Hd = <delta, Hxd> (curvature) */
  cudakernel_fns_fhess(ThreadsPerBlock,BlocksPerGrid,N,M,x,delta,Hxd,y,coh,bbh,iw, cbhandle, solver_handle);
  d_Hd=cudakernel_fns_g(N,x,delta,Hxd,cbhandle,solver_handle);
  alpha=z_r/d_Hd;
  e_Pe_new = e_Pe + 2.0f*alpha*e_Pd + alpha*alpha*d_Pd;
  Deltasq=Delta*Delta;
  /* negative curvature or step leaves the trust region:
    go to the boundary (solve ||eta+tau*delta||=Delta for tau) and stop */
  if (d_Hd <= 0.0f || e_Pe_new >= Deltasq) {
   tau = (-e_Pd + sqrtf(e_Pd*e_Pd + d_Pd*(Deltasq-e_Pe)))/d_Pd;
   a0.x=tau;
   cbstatus=cublasCaxpy(cbhandle,4*N, &a0, delta, 1, eta, 1);
   /* Heta = Heta + tau *Hdelta */
   cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, fhess, 1);
   stop_tCG=(d_Hd<=0.0f?1:2);
   break;
  }
  /* accept the CG step: eta += alpha*delta, fhess += alpha*Hxd,
    residual r += alpha*Hxd */
  e_Pe=e_Pe_new;
  a0.x=alpha;
  cbstatus=cublasCaxpy(cbhandle,4*N, &a0, delta, 1, eta, 1);
  /* Heta = Heta + alpha*Hdelta */
  cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, fhess, 1);
  cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, r, 1);
  /* keep the residual in the tangent space at x */
  cudakernel_fns_proj(N, x, r, rnew, cbhandle,solver_handle);
  cbstatus=cublasCcopy(cbhandle,4*N,rnew,1,r,1);
  r_r=cudakernel_fns_g(N,x,r,r,cbhandle,solver_handle);
  norm_r=sqrtf(r_r);
  /*
   check kappa/theta stopping criterion
  */
  if (cj >= min_inner) {
   float norm_r0pow=powf(norm_r0,theta);
   if (norm_r <= norm_r0*MIN(norm_r0pow,kappa)) {
    /* 3: linear convergence regime (kappa), 4: superlinear (theta) */
    stop_tCG=(kappa<norm_r0pow?3:4);
    break;
   }
  }
  /* CG direction update: delta <= -z + beta*delta */
  cbstatus=cublasCcopy(cbhandle,4*N,r,1,z,1);
  zold_rold=z_r;
  z_r=cudakernel_fns_g(N,x,z,r,cbhandle,solver_handle);
  beta=z_r/zold_rold;
  a0.x=beta;
  cbstatus=cublasCscal(cbhandle,4*N,&a0,delta,1);
  a0.x=-1.0f;
  cbstatus=cublasCaxpy(cbhandle,4*N, &a0, z, 1, delta, 1);
  /* update the running inner products used for the boundary test */
  e_Pd = beta*(e_Pd + alpha*d_Pd);
  d_Pd = z_r + beta*beta*d_Pd;
 }
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaFree(r);
 cudaFree(z);
 cudaFree(delta);
 cudaFree(Hxd);
 cudaFree(rnew);
 return stop_tCG;
}
/* follow clmfit_fl.c */
/* Riemannian trust-region solver (single precision, GPU) for one cluster's
   2Nx2 Jacobi matrix J. Reorders the packed solution x0 into column-major
   J, copies everything to the device, runs a few steepest-descent (Armijo)
   iterations to warm-start, then the RTR outer loop with tCG inner solves.
   Writes the improved solution back into x0 only if the cost decreased.
   Always returns 0. */
int
rtr_solve_cuda_fl(
 float *x0, /* initial values and updated solution at output (size 8*N float) */
 float *y, /* data vector (size 8*M float) */
 int N, /* no of stations */
 int M, /* no of constraints */
 int itmax_sd, /* maximum number of iterations RSD */
 int itmax_rtr, /* maximum number of iterations RTR */
 float Delta_bar, float Delta0, /* Trust region radius and initial value */
 double *info, /* initial and final residuals */
 cublasHandle_t cbhandle, /* device handle */
 cusolverDnHandle_t solver_handle, /* solver handle */
 int tileoff, /* tile offset when solving for many chunks */
 int ntiles, /* total tile (data) size being solved for */
 me_data_t *adata)
{
 /* general note: all device variables end with a 'd' */
 cudaError_t err;
 cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
 /* ME data */
 me_data_t *dp=(me_data_t*)adata;
 int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
 /* coherency on device */
 float *cohd;
 /* baseline-station map on device/host */
 short *bbd;
 /* calculate no of cuda threads and blocks */
 int ThreadsPerBlock=DEFAULT_TH_PER_BK;
 int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
 /* reshape x to make J: 2Nx2 complex double
 */
 complex float *x;
 if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
#ifndef USE_MIC
  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
  exit(1);
 }
 /* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
  to
  J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
 */
 /* strided copies: stride 8 in the packed x0, stride 4 in the re/im view of J */
 float *Jd=(float*)x;
 /* re J(0,0) */
 my_fcopy(N, &x0[0], 8, &Jd[0], 4);
 /* im J(0,0) */
 my_fcopy(N, &x0[1], 8, &Jd[1], 4);
 /* re J(1,0) */
 my_fcopy(N, &x0[4], 8, &Jd[2], 4);
 /* im J(1,0) */
 my_fcopy(N, &x0[5], 8, &Jd[3], 4);
 /* re J(0,1) */
 my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
 /* im J(0,1) */
 my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
 /* re J(1,1) */
 my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
 /* im J(1,1) */
 my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);
 int ci;
/***************************************************/
 /* device buffers: solution, gradient, tCG step, Hessian, proposal */
 cuFloatComplex *xd,*fgradxd,*etad,*Hetad,*x_propd;
 float *yd;
 /* for counting how many baselines contribute to each station
   grad/hess calculation */
 float *iwd,*iw;
 if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
#ifndef USE_MIC
  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
  exit(1);
 }
 /* NOTE(review): cudaMalloc return values below are not checked;
   a failed allocation would surface later via the CUDA API calls */
 cudaMalloc((void**)&fgradxd, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&etad, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&Hetad, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&x_propd, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&xd, sizeof(cuFloatComplex)*4*N);
 cudaMalloc((void**)&yd, sizeof(float)*8*M);
 cudaMalloc((void**)&cohd, sizeof(float)*8*Nbase);
 cudaMalloc((void**)&bbd, sizeof(short)*2*Nbase);
 cudaMalloc((void**)&iwd, sizeof(float)*N);
 /* need 8N*(BlocksPerGrid+8) for tcg_solve+grad/hess storage,
  so total storage needed is
  8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N
 */
 /* yd <=y : V */
 err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 /* need to give right offset for coherencies */
 /* offset: cluster offset+time offset */
 /* C */
 err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 /* correct offset for baselines */
 err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(short), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 /* xd <=x : solution */
 err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 float fx,fx0,norm_grad,Delta,fx_prop,rhonum,rhoden,rho;
 /* count how many baselines contribute to each station, store (inverse) in iwd */
 count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
 err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
 checkCudaError(err,__FILE__,__LINE__);
 free(iw);
 /* initial cost */
 fx=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd);
 fx0=fx;
#ifdef DEBUG
printf("Initial Cost=%g\n",fx0);
#endif
/***************************************************/
 /* warm start: a few Riemannian steepest-descent (Armijo) iterations */
 int rsdstat=0;
 /* RSD solution */
 for (ci=0; ci<itmax_sd; ci++) {
  /* Armijo step */
  /* teta=armijostep(V,C,N,x); */
  //armijostep(N,x,eta,y,&gdata);
  rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,&fx,cbhandle,solver_handle);
  /* x=R(x,teta); */
  cudakernel_fns_R(N,xd,etad,x_propd,cbhandle,solver_handle);
  //my_ccopy(4*N,x_propd,1,xd,1);
  if (!rsdstat) {
   /* cost reduced, update solution */
   cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
  } else {
   /* no cost reduction, break loop */
   break;
  }
 }
 /* derive trust-region radii from the RSD cost */
 Delta_bar=MIN(fx,0.01f);
 Delta0=Delta_bar*0.125f;
//printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);
#ifdef DEBUG
 printf("NEW RSD cost=%g\n",fx);
#endif
/***************************************************/
 int min_inner,max_inner,min_outer,max_outer;
 float epsilon,kappa,theta,rho_prime;
 min_inner=1; max_inner=itmax_rtr;//8*N;
 min_outer=3;//itmax_rtr; //3;
 max_outer=itmax_rtr;
 epsilon=(float)CLM_EPSILON;
 kappa=0.1f;
 theta=1.0f;
 /* default values 0.25, 0.75, 0.25, 2.0 */
 float eta1=0.0001f; float eta2=0.99f; float alpha1=0.25f; float alpha2=3.5f;
 rho_prime=eta1; /* should be <= 0.25, tune for parallel solve */
 float rho_regularization; /* use large damping */
 rho_regularization=fx*1e-6f;
 /* damping: too small => locally converge, globally diverge
  |\
  |\ | \___
 -|\ | \|
   \
  right damping: locally and globally converge
 -|\
  \|\
   \|\
    \____
 */
 float rho_reg;
 int model_decreased=0;
 /* RTR solution */
 int k=0;
 int stop_outer=(itmax_rtr>0?0:1);
 int stop_inner=0;
 if (!stop_outer) {
  /* negative gradient and its norm for the first outer iteration */
  cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,1,cbhandle,solver_handle);
  norm_grad=sqrtf(cudakernel_fns_g(N,xd,fgradxd,fgradxd,cbhandle,solver_handle));
 }
 Delta=Delta0;
 /* initial residual */
 info[0]=fx0;
 /*
 % ** Start of TR loop **
 */
 while(!stop_outer) {
  /*
   % update counter
  */
  k++;
  /* eta = 0*fgradx; */
  cudaMemset(etad, 0, sizeof(cuFloatComplex)*4*N);
  /* solve TR subproblem, also returns Hessian */
  stop_inner=tcg_solve_cuda(ThreadsPerBlock,BlocksPerGrid, N, M, xd, fgradxd, etad, Hetad, Delta, theta, kappa, max_inner, min_inner,yd,cohd,bbd,iwd,cbhandle, solver_handle);
  /*
    Heta = fns.fhess(x,eta);
  */
  /*
   compute the retraction of the proposal
  */
  cudakernel_fns_R(N,xd,etad,x_propd,cbhandle,solver_handle);
  /*
   compute cost of the proposal
  */
  fx_prop=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x_propd,yd,cohd,bbd);
  /*
   check the performance of the quadratic model
  */
  /* rho = (actual reduction) / (model-predicted reduction) */
  rhonum=fx-fx_prop;
  rhoden=-cudakernel_fns_g(N,xd,fgradxd,etad,cbhandle,solver_handle)-0.5f*cudakernel_fns_g(N,xd,Hetad,etad,cbhandle,solver_handle);
  /* regularization of rho ratio */
  /*
  rho_reg = max(1, abs(fx)) * eps * options.rho_regularization;
  rhonum = rhonum + rho_reg;
  rhoden = rhoden + rho_reg;
  */
  rho_reg=MAX(1.0f,fx)*rho_regularization; /* no epsilon */
  rhonum+=rho_reg;
  rhoden+=rho_reg;
  /*
    rho = rhonum / rhoden;
  */
  rho=rhonum/rhoden;
  /* model_decreased = (rhoden >= 0); */
  /* OLD CODE if (fabsf(rhonum/fx) <sqrtf_epsilon) {
   rho=1.0f;
  } */
  model_decreased=(rhoden>=0.0f?1:0);
#ifdef DEBUG
  printf("stop_inner=%d rho_reg=%g rho =%g/%g= %g rho'= %g\n",stop_inner,rho_reg,rhonum,rhoden,rho,rho_prime);
#endif
  /*
   choose new TR radius based on performance
  */
  /* shrink on poor model agreement; grow when good and tCG hit the boundary */
  if ( !model_decreased || rho<eta1 ) {
   Delta=alpha1*Delta;
  } else if (rho>eta2 && (stop_inner==2 || stop_inner==1)) {
   Delta=MIN(alpha2*Delta,Delta_bar);
  }
  /*
   choose new iterate based on performance
  */
  if (model_decreased && rho>rho_prime) {
   /* accept the proposal and recompute the negative gradient */
   cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
   fx=fx_prop;
   cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,1,cbhandle,solver_handle);
   norm_grad=sqrtf(cudakernel_fns_g(N,xd,fgradxd,fgradxd,cbhandle,solver_handle));
  }
  /*
   Testing for Stop Criteria
  */
  if (norm_grad<epsilon && k>min_outer) {
   stop_outer=1;
  }
  /*
   stop after max_outer iterations
   */
  if (k>=max_outer) {
   stop_outer=1;
  }
#ifdef DEBUG
printf("Iter %d cost=%g\n",k,fx);
#endif
 }
 /* final residual */
 info[1]=fx;
#ifdef DEBUG
 printf("NEW RTR cost=%g\n",fx);
#endif
/***************************************************/
 checkCublasError(cbstatus,__FILE__,__LINE__);
 cudaDeviceSynchronize();
 /* only write back if the solver actually improved the cost */
 if(fx0>fx) {
  //printf("Cost final %g initial %g\n",fx,fx0);
  /* copy back current solution */
  err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
  checkCudaError(err,__FILE__,__LINE__);
  /* copy back solution to x0 : format checked*/
  /* re J(0,0) */
  my_fcopy(N, &Jd[0], 4, &x0[0], 8);
  /* im J(0,0) */
  my_fcopy(N, &Jd[1], 4, &x0[1], 8);
  /* re J(1,0) */
  my_fcopy(N, &Jd[2], 4, &x0[4], 8);
  /* im J(1,0) */
  my_fcopy(N, &Jd[3], 4, &x0[5], 8);
  /* re J(0,1) */
  my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
  /* im J(0,1) */
  my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
  /* re J(1,1) */
  my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
  /* im J(1,1) */
  my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);
 }
 free(x);
 cudaFree(fgradxd);
 cudaFree(etad);
 cudaFree(Hetad);
 cudaFree(x_propd);
 cudaFree(xd);
 cudaFree(yd);
 cudaFree(cohd);
 cudaFree(bbd);
 cudaFree(iwd);
 return 0;
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,443 +0,0 @@
/*
*
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
$Id$
*/
#include "Solvers.h"
#include <math.h>
/* Digamma function
if x>7 use digamma(x) = digamma(x+1) - 1/x
for accuracy
using maple expansion
series(Psi(x+1/2), x=infinity, 21);
ln(x)+1/24/x^2-7/960/x^4+31/8064/x^6-127/30720/x^8+511/67584/x^10-1414477/67092480/x^12+8191/98304/x^14-118518239/267386880/x^16+5749691557/1882718208/x^18-91546277357/3460300800/x^20+O(1/x^21)
based on code by Mark Johnson, 2nd September 2007
*/
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static double
digamma(double x) {
/* FIXME catch -ve value as input */
double result = 0.0, xx, xx2, xx4;
for ( ; x < 7.0; ++x) { /* reduce x till x<7 */
result -= 1.0/x;
}
x -= 0.5;
xx = 1.0/x;
xx2 = xx*xx;
xx4 = xx2*xx2;
result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4;
return result;
}
/* Thread worker: per-element weight update for the Student's t EM step.
   For each index in [starti,endi]:
     w_i <= (nu0 + 1) / (nu0 + e_i^2)
     q_i <= w_i - log(w_i)   (so that it is +ve)
*/
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void *
w_nu_update_threadfn(void *data) {
  thread_data_vecnu_t *td = (thread_data_vecnu_t *)data;
  int idx;
  for (idx = td->starti; idx <= td->endi; idx++) {
    /* (original had a commented-out td->ed[idx] *= td->wtd[idx] here) */
    double e = td->ed[idx];
    td->wtd[idx] = (td->nu0 + 1.0) / (td->nu0 + e * e);
    td->q[idx] = td->wtd[idx] - log(td->wtd[idx]);
  }
  return NULL;
}
/* Thread worker: replace each weight in [starti,endi] by its square root. */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void *
w_sqrt_threadfn(void *data) {
  thread_data_vecnu_t *td = (thread_data_vecnu_t *)data;
  int idx;
  for (idx = td->starti; idx <= td->endi; idx++) {
    td->wtd[idx] = sqrt(td->wtd[idx]);
  }
  return NULL;
}
/* Thread worker: evaluate the nu objective on a grid of candidate nu values.
   Grid point ci maps to nu = nulow + ci*deltanu (deltanu passed via nu0).
   q[ci] = psi((nu+1)/2) - log((nu+1)/2) - psi(nu/2) + log(nu/2) - sumq + 1
   where sumq = (1/N) sum(w_i - log(w_i)); the minimizer of q is the update. */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void *
q_update_threadfn(void *data) {
  thread_data_vecnu_t *td = (thread_data_vecnu_t *)data;
  int ci;
  for (ci = td->starti; ci <= td->endi; ci++) {
    double nu = td->nulow + (double)ci * td->nu0; /* deltanu stored in nu0 */
    /* psi((nu+1)/2) - log((nu+1)/2) */
    double val = digamma(nu * 0.5 + 0.5) - log((nu + 1.0) * 0.5);
    /* -psi(nu/2) + log(nu/2) */
    val += log(nu * 0.5) - digamma(nu * 0.5);
    /* q is w-log(w), so -ve: sum(ln(w_i))/N - sum(w_i)/N + 1 */
    td->q[ci] = val - td->sumq + 1.0;
  }
  return NULL;
}
/* Thread worker (AECM variant): evaluate the nu objective on a grid.
   Grid point ci maps to nu = nulow + ci*deltanu (deltanu passed via nu0).
   q[ci] = -psi(nu/2) + log(nu/2) - sumq + 1
   (the psi((nu_old+p)/2) constant is folded into sumq by the caller). */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void *
q_update_threadfn_aecm(void *data) {
  thread_data_vecnu_t *td = (thread_data_vecnu_t *)data;
  int ci;
  for (ci = td->starti; ci <= td->endi; ci++) {
    double nu = td->nulow + (double)ci * td->nu0; /* deltanu stored in nu0 */
    /* -psi(nu/2) + log(nu/2), then subtract sumq and add 1 */
    td->q[ci] = log(nu * 0.5) - digamma(nu * 0.5) - td->sumq + 1.0;
  }
  return NULL;
}
/* Update nu (degrees of freedom) of the Student's t error model, and also
   update the weight vector w.

   nu0: current value of nu
   w: Nx1 weight vector (output: sqrt of the EM weights)
   ed: Nx1 residual error
   N: no. of data points
   Nt: no. of threads
   nulow,nuhigh: search range for the new nu

   psi() : digamma function.
   The new nu is the grid point in [nulow,nuhigh] minimizing
   |psi((nu+1)/2)-ln((nu+1)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i)+1|.
   Returns the updated nu.

   NOTE(review): phase 3 reuses the length-N scratch array q for Nd=30 grid
   values, so this assumes N >= 30 -- confirm against callers. */
double
update_w_and_nu(double nu0, double *w, double *ed, int N, int Nt, double nulow, double nuhigh) {
  int Nd=30; /* no of grid samples used to estimate nu */
  int nth,nth1,ci;
  int Nthb0,Nthb;
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_vecnu_t *threaddata;
  double deltanu,*q,thisnu,sumq;

  if ((q=(double*)calloc((size_t)N,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  /* setup joinable threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((threaddata=(thread_data_vecnu_t*)malloc((size_t)Nt*sizeof(thread_data_vecnu_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  /* phase 1: w <= (nu0+1)/(nu0+e^2), q <= w-log(w), over the N points */
  /* min no. of values a thread can handle */
  Nthb0=(N+Nt-1)/Nt;
  /* iterate over threads, allocating index ranges per thread */
  ci=0;
  for (nth=0; nth<Nt && ci<N; nth++) {
    if (ci+Nthb0<N) {
      Nthb=Nthb0;
    } else {
      Nthb=N-ci;
    }
    threaddata[nth].starti=ci;
    threaddata[nth].endi=ci+Nthb-1;
    threaddata[nth].ed=ed;
    threaddata[nth].wtd=w;
    threaddata[nth].q=q;
    threaddata[nth].nu0=nu0;
    threaddata[nth].nulow=nulow;
    threaddata[nth].nuhigh=nuhigh;
    pthread_create(&th_array[nth],&attr,w_nu_update_threadfn,(void*)(&threaddata[nth]));
    /* next index set */
    ci=ci+Nthb;
  }
  /* wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
    pthread_join(th_array[nth1],NULL);
  }

  sumq=my_dasum(N,q)/(double)N; /* sum(|w_i-log(w_i)|)/N, all elements +ve */

  /* phase 2: w <= sqrt(w), reusing the per-thread index ranges from above */
  for(nth1=0; nth1<nth; nth1++) {
    pthread_create(&th_array[nth1],&attr,w_sqrt_threadfn,(void*)(&threaddata[nth1]));
  }
  for(nth1=0; nth1<nth; nth1++) {
    pthread_join(th_array[nth1],NULL);
  }

  /* phase 3: grid search; range ~2..30 because if nu~=30, it is Gaussian */
  deltanu=(nuhigh-nulow)/(double)Nd;
  Nthb0=(Nd+Nt-1)/Nt;
  /* too few grid points per thread: halve the thread count */
  if (Nthb0<=2) {
    Nt=Nt/2;
    Nthb0=(Nd+Nt-1)/Nt;
  }
  ci=0;
  for (nth=0; nth<Nt && ci<Nd; nth++) {
    if (ci+Nthb0<Nd) {
      Nthb=Nthb0;
    } else {
      Nthb=Nd-ci;
    }
    threaddata[nth].starti=ci;
    threaddata[nth].endi=ci+Nthb-1;
    threaddata[nth].q=q;
    threaddata[nth].nu0=deltanu; /* deltanu passed via the nu0 field */
    /* BUGFIX: q_update_threadfn reads nulow; when N<Nt the phase-1 loop
       never initialized this slot, so set it (and nuhigh) explicitly here */
    threaddata[nth].nulow=nulow;
    threaddata[nth].nuhigh=nuhigh;
    threaddata[nth].sumq=sumq;
    pthread_create(&th_array[nth],&attr,q_update_threadfn,(void*)(&threaddata[nth]));
    /* next grid chunk */
    ci=ci+Nthb;
  }
  for(nth1=0; nth1<nth; nth1++) {
    pthread_join(th_array[nth1],NULL);
  }

  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);

  /* grid point minimizing |q| is the new nu */
  ci=my_idamin(Nd,q,1);
  thisnu=(nulow+(double)ci*deltanu);
  free(q);
  return thisnu; /* BUGFIX: removed unreachable 'return 0;' after this */
}
/* AECM update of nu (degrees of freedom).

   logsumw: (1/N) sum(log(w_i) - w_i)
   Nd: number of grid points in [nulow,nuhigh]
   Nt: number of threads
   p: 1 or 8
   nu_old: previous value of nu

   psi(): digamma function. Finds the grid value of nu minimizing
   |psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+logsumw+1|
   and returns it. */
double
update_nu(double logsumw, int Nd, int Nt, double nulow, double nuhigh, int p, double nu_old) {
  int offset,tid,jj,chunk,chunk0;
  double step,best,*grid;
  pthread_attr_t attr;
  pthread_t *workers;
  thread_data_vecnu_t *tdata;

  if ((grid=(double*)calloc((size_t)Nd,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  /* set up joinable threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((workers=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((tdata=(thread_data_vecnu_t*)malloc((size_t)Nt*sizeof(thread_data_vecnu_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  /* constant term: psi((nu_old+p)/2) - log((nu_old+p)/2) */
  double dgm=digamma((nu_old+(double)p)*0.5);
  dgm-=log((nu_old+(double)p)*0.5);

  step=(nuhigh-nulow)/(double)Nd;
  chunk0=(Nd+Nt-1)/Nt;
  /* too few grid points per thread: halve the thread count */
  if (chunk0<=2) {
    Nt=Nt/2;
    chunk0=(Nd+Nt-1)/Nt;
  }
  /* hand each thread a contiguous run of grid points */
  offset=0;
  for (tid=0; tid<Nt && offset<Nd; tid++) {
    chunk=(offset+chunk0<Nd?chunk0:Nd-offset);
    tdata[tid].starti=offset;
    tdata[tid].endi=offset+chunk-1;
    tdata[tid].q=grid;
    tdata[tid].nu0=step; /* deltanu passed via the nu0 field */
    tdata[tid].nulow=nulow;
    tdata[tid].nuhigh=nuhigh;
    tdata[tid].sumq=-logsumw-dgm;
    pthread_create(&workers[tid],&attr,q_update_threadfn_aecm,(void*)(&tdata[tid]));
    offset+=chunk;
  }
  /* wait for all spawned workers */
  for(jj=0; jj<tid; jj++) {
    pthread_join(workers[jj],NULL);
  }

  pthread_attr_destroy(&attr);
  free(workers);
  free(tdata);

  /* grid point with minimum |objective| wins */
  offset=my_idamin(Nd,grid,1);
  best=nulow+((double)offset)*step;
  free(grid);
  return best;
}
/* ud = sqrt(u^2+v^2) */
static double
ncp_weight(double ud) {
/* fo(x) = 1/(1+alpha*exp(-x/A))
A ~=30
*/
if (ud>400.0) return 1.0; /* no effect on long baselines */
//return 1.0/(1.0+0.4*exp(-0.05*ud));
return 1.0/(1.0+1.8*exp(-0.05*ud));
}
/* Thread worker: scale the 8 weight entries of every baseline in this
   thread's range by the uv-distance taper ncp_weight(). */
static void *
threadfn_setblweight(void *data) {
  thread_data_baselinewt_t *td = (thread_data_baselinewt_t *)data;
  int bl, k;
  for (bl = 0; bl < td->Nb; bl++) {
    int idx = bl + td->boff;
    /* uv-distance, converted to wavelengths via freq0 */
    double uu = td->u[idx] * td->freq0;
    double vv = td->v[idx] * td->freq0;
    double scale = ncp_weight(sqrt(uu * uu + vv * vv));
    for (k = 0; k < 8; k++) {
      td->wt[8 * idx + k] *= scale;
    }
  }
  return NULL;
}
/*
  Taper data by weighting based on uv distance (for short baselines),
  e.g. using weights as the inverse density function 1/(1+f(u,v)).
  As u,v->inf, f(u,v)->0, so long baselines are not affected.
  x: Nbase*8 x 1 (input/output) data
  u,v: Nbase x 1
  note: u=u/c, v=v/c here, so freq0 is needed to convert to wavelengths */
void
whiten_data(int Nbase, double *x, double *u, double *v, double freq0, int Nt) {
  pthread_attr_t attr;
  pthread_t *workers;
  thread_data_baselinewt_t *tdata;
  int tid,jj,offset,chunk,chunk0;

  chunk0=(Nbase+Nt-1)/Nt;
  /* set up joinable threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
  if ((workers=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((tdata=(thread_data_baselinewt_t*)malloc((size_t)Nt*sizeof(thread_data_baselinewt_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  /* hand each thread a contiguous run of baselines */
  offset=0;
  for (tid=0; tid<Nt && offset<Nbase; tid++) {
    chunk=(offset+chunk0<Nbase?chunk0:Nbase-offset);
    tdata[tid].Nb=chunk;
    tdata[tid].boff=offset;
    tdata[tid].wt=x;
    tdata[tid].u=u;
    tdata[tid].v=v;
    tdata[tid].freq0=freq0;
    pthread_create(&workers[tid],&attr,threadfn_setblweight,(void*)(&tdata[tid]));
    offset+=chunk;
  }
  /* wait for all spawned workers */
  for(jj=0; jj<tid; jj++) {
    pthread_join(workers[jj],NULL);
  }
  pthread_attr_destroy(&attr);
  free(workers);
  free(tdata);
}

Binary file not shown.

File diff suppressed because it is too large Load Diff