Header file and Solvers folder redundant as well
This commit is contained in:
parent
d6b518116e
commit
1e2c24b31a
Binary file not shown.
Binary file not shown.
|
@ -1,2 +0,0 @@
|
|||
*.swp
|
||||
*.swo
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,54 +0,0 @@
|
|||
# Build rules for the CPU-only solver library (libsolvers.a).
CC=gcc
CXX=g++
#CFLAGS= -Wall -O3 -g #-pg
CFLAGS= -Wall -O3 -fopt-info-optimized
CLIBS= -lm -lpthread
# Alternative BLAS/LAPACK backends (pick one):
#LAPACK=-L/usr/lib/atlas/sse -llapack -lblas
#LAPACK=-L/usr/local/GotoBLAS2/lib -lgoto2 -lpthread -lgfortran
LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread

INCLUDES= -I.
LIBPATH=

#### glib
GLIBI=-I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -I/usr/lib/x86_64-linux-gnu/glib-2.0/include/ -I/usr/lib64/glib-2.0/include
GLIBL=-lglib-2.0

OBJECTS= lmfit_nocuda.o clmfit_nocuda.o lbfgs_nocuda.o myblas.o residual.o robustlm.o updatenu.o robust_lbfgs_nocuda.o rtr_solve.o rtr_solve_robust.o manifold_average.o consensus_poly.o rtr_solve_robust_admm.o admm_solve.o

# 'default' produces no file of that name, so mark it phony.
.PHONY: default
default:libsolvers.a

# One pattern rule replaces the previous 14 identical per-file rules;
# every object was compiled with exactly this recipe.
%.o: %.c
	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<

RANLIB=ranlib
libsolvers.a:$(OBJECTS) Solvers.h
	ar rv $@ $(OBJECTS); \
	$(RANLIB) $@;
|
|
@ -1,88 +0,0 @@
|
|||
# Build rules for the CUDA-enabled solver library (libsolvers-gpu.a).
CC=gcc
CXX=g++
NVCC=nvcc
CFLAGS= -Wall -O3 -g -DHAVE_CUDA -DHYBRID_CODE
CLIBS= -lm -lpthread
LAPACK=-L/usr/local/OpenBLAS/lib/ -lopenblas -lgfortran -lpthread

CUDAINC=/usr/local/cuda/include
CUDALIB=-L/usr/local/cuda/lib64 -lcuda -lcudart
#NVCC=/usr/local/cuda/bin/nvcc
#NVCFLAGS=-arch=sm_35 -g -G --ptxas-options=-v -O3
NVCFLAGS=-arch=sm_35 --ptxas-options=-v -O3

#### glib
GLIBI=-I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include/
GLIBL=-lglib-2.0 -L/usr/lib64

# NVML
NVML_INC=/usr/include/nvidia/gdk/
NVML_LIB=-lnvidia-ml -L/usr/lib64/nvidia/

INCLUDES= -I. -I$(CUDAINC) -I$(NVML_INC)
LIBPATH= $(CUDALIB)

OBJECTS=lmfit.o lbfgs.o myblas.o mderiv.o clmfit.o clmfit_nocuda.o residual.o barrier.o robust.o robustlm.o oslmfit.o mderiv_fl.o clmfit_fl.o updatenu.o robust_lbfgs_nocuda.o robust_fl.o manifold_fl.o rtr_solve_cuda.o rtr_solve_robust_cuda.o diagnostics.o diag_fl.o manifold_average.o consensus_poly.o rtr_solve_robust_cuda_admm.o rtr_solve_robust_admm.o admm_solve.o load_balance.o

# 'default' produces no file of that name, so mark it phony.
.PHONY: default
default:libsolvers-gpu.a

# Pattern rules replace the previous per-file rules (each object used
# exactly one of these two recipes). Make tries %.c first and falls
# back to %.cu for objects whose source is a CUDA file.
%.o: %.c
	$(CC) $(CFLAGS) $(INCLUDES) $(GLIBI) -c $<
%.o: %.cu
	$(NVCC) $(NVCFLAGS) $(INCLUDES) $(GLIBI) -c $<

RANLIB=ranlib
libsolvers-gpu.a:$(OBJECTS) Solvers.h
	ar rv $@ $(OBJECTS); \
	$(RANLIB) $@;
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -1,121 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "Solvers.h"
|
||||
/* implementation of a barrier to sync threads.
|
||||
The barrier has two doors (enter and exit). Only one door
|
||||
can be open at a time. Initially the enter door is open.
|
||||
All threads that enter the barrier are sleeping (wait).
|
||||
The last thread to enter the barrier will
|
||||
1)close the enter door
|
||||
2)wakeup all sleeping threads.
|
||||
3)open the exit door.
|
||||
So the woken up threads will leave the barrier one by
|
||||
one, as they are awoken. The last thread to leave the barrier
|
||||
will
|
||||
1)open the enter door
|
||||
2)close the exit door,
|
||||
So finally the barrier reaches its initial state
|
||||
*/
|
||||
|
||||
/* initialize barrier */
|
||||
/* N - no. of accomodated threads */
|
||||
void
|
||||
init_th_barrier(th_barrier *barrier, int N)
|
||||
{
|
||||
barrier->tcount=0; /* initially empty */
|
||||
barrier->nthreads=N;
|
||||
pthread_mutex_init(&barrier->enter_mutex,NULL);
|
||||
pthread_mutex_init(&barrier->exit_mutex,NULL);
|
||||
pthread_cond_init(&barrier->lastthread_cond,NULL);
|
||||
pthread_cond_init(&barrier->exit_cond,NULL);
|
||||
}
|
||||
/* destroy barrier */
|
||||
void
|
||||
destroy_th_barrier(th_barrier *barrier)
|
||||
{
|
||||
pthread_mutex_destroy(&barrier->enter_mutex);
|
||||
pthread_mutex_destroy(&barrier->exit_mutex);
|
||||
pthread_cond_destroy(&barrier->lastthread_cond);
|
||||
pthread_cond_destroy(&barrier->exit_cond);
|
||||
barrier->tcount=barrier->nthreads=0;
|
||||
}
|
||||
/* the main operation of the barrier */
/* Block the caller until all barrier->nthreads threads have arrived.
   Two-door design: the enter door (enter_mutex) is closed while threads
   are draining out, so a fast thread cannot re-enter the barrier before
   the previous round has fully completed. The exit_mutex also protects
   tcount and serves as the mutex for both condition variables. */
void
sync_barrier(th_barrier *barrier)
{
 /* trivial case */
 if(barrier->nthreads <2) return;
 /* else */
 /* new threads enters the barrier. Now close the entry door
  so that other threads cannot enter the barrier until we are done */
 pthread_mutex_lock(&barrier->enter_mutex);
 /* next lock the exit mutex - no threads can leave either */
 pthread_mutex_lock(&barrier->exit_mutex);
 /* now check to see if this is the last expected thread */
 if( ++(barrier->tcount) < barrier->nthreads) {
  /* no. this is not the last thread. so open the entry door */
  pthread_mutex_unlock(&barrier->enter_mutex);
  /* go to sleep */
  /* NOTE(review): no loop around pthread_cond_wait — relies on no
     spurious wakeups; the exit_mutex is re-acquired on wakeup */
  pthread_cond_wait(&barrier->exit_cond,&barrier->exit_mutex);
 } else {
  /* this is the last thread */
  /* wakeup sleeping threads */
  pthread_cond_broadcast(&barrier->exit_cond);
  /* go to sleep until all threads are woken up
    and leave the barrier */
  pthread_cond_wait(&barrier->lastthread_cond,&barrier->exit_mutex);
  /* now all threads have left the barrier. so open the entry door again */
  pthread_mutex_unlock(&barrier->enter_mutex);
 }
 /* next to the last thread leaving the barrier */
 /* (tcount drops as each thread leaves; when it hits 1 only the
    last-arrived thread is still asleep on lastthread_cond) */
 if(--(barrier->tcount)==1) {
  /* wakeup the last sleeping thread */
  pthread_cond_broadcast(&barrier->lastthread_cond);
 }
 pthread_mutex_unlock(&barrier->exit_mutex);
}
|
||||
|
||||
|
||||
|
||||
/* master and two slaves */
|
||||
//int
|
||||
//main(int argc, char *argv[]) {
|
||||
// th_pipeline p;
|
||||
//
|
||||
// gbdata g;
|
||||
//
|
||||
// init_pipeline(&p,&g);
|
||||
//sync_barrier(&(p.gate1)); /* stop at gate 1 */
|
||||
// g.status=0; /* master work */
|
||||
//sync_barrier(&(p.gate2)); /* stop at gate 2 */
|
||||
// //exec_pipeline(&p);
|
||||
//sync_barrier(&(p.gate1)); /* stop at gate 1 */
|
||||
// g.status=10; /* master work */
|
||||
//sync_barrier(&(p.gate2)); /* stop at gate 2 */
|
||||
// //exec_pipeline(&p);
|
||||
// destroy_pipeline(&p);
|
||||
// /* still need to free slave_data structs, from data */
|
||||
// return 0;
|
||||
//}
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -1,349 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "Solvers.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
|
||||
//#define DEBUG
|
||||
/* build matrix with polynomial terms
  B : Npoly x Nf, each row is one basis function
  Npoly : total basis functions
  Nf: frequencies
  freqs: Nfx1 array freqs
  freq0: reference freq
  type :
  0 :[1 ((f-fo)/fo) ((f-fo)/fo)^2 ...] basis functions
  1 : normalize each row such that norm is 1
  2 : Bernstein poly \sum N_C_r x^r (1-x)^r where x in [0,1] : use min,max values of freq to normalize
  Note: freqs might not be in sorted order, so need to search array to find min,max values
  3: [1 ((f-fo)/fo) (fo/f-1) ((f-fo)/fo)^2 (fo/f-1)^2 ... ] basis, for this case odd Npoly preferred

  Returns 0 always (an unknown type only prints a diagnostic).
  B is stored so that column ci (freq ci) occupies B[ci*Npoly .. ci*Npoly+Npoly-1].
*/
int
setup_polynomials(double *B, int Npoly, int Nf, double *freqs, double freq0, int type) {

 if (type==0 || type==1) {
  double frat,dsum;
  double invf=1.0/freq0;
  int ci,cm;
  for (ci=0; ci<Nf; ci++) {
   /* cumulative product builds powers of (f-f0)/f0 without pow() */
   B[ci*Npoly]=1.0;
   frat=(freqs[ci]-freq0)*invf;
   for (cm=1; cm<Npoly; cm++) {
    B[ci*Npoly+cm]=B[ci*Npoly+cm-1]*frat;
   }
  }
#ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]);
   }
   printf("\n");
  }
  printf("];\n");
#endif
  if (type==1) {
   /* normalize each row such that norm is 1 */
   for (cm=0; cm<Npoly; cm++) {
    dsum=0.0;
    for (ci=0; ci<Nf; ci++) {
     dsum+=B[ci*Npoly+cm]*B[ci*Npoly+cm];
    }
    /* guard against an all-zero row: leave it zero instead of dividing by 0 */
    if (dsum>0.0) {
     invf=1.0/sqrt(dsum);
    } else {
     invf=0.0;
    }
    for (ci=0; ci<Nf; ci++) {
     B[ci*Npoly+cm] *=invf;
    }
   }
  }
 } else if (type==2) {
  /* Bernstein polynomials */
  /* my_idamax/my_idamin return 1-based indices (BLAS convention),
     hence the -1 when dereferencing below */
  int idmax=my_idamax(Nf, freqs, 1);
  int idmin=my_idamin(Nf, freqs, 1);
  double fmax=freqs[idmax-1];
  double fmin=freqs[idmin-1];
  double *fact; /* factorial array */
  double *px,*p1x; /* arrays for powers of x and (1+x) */
  if ((fact=(double*)calloc((size_t)Npoly,sizeof(double)))==0) {
   printf("%s: %d: no free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((px=(double*)calloc((size_t)Npoly*Nf,sizeof(double)))==0) {
   printf("%s: %d: no free memory\n",__FILE__,__LINE__);
   exit(1);
  }
  if ((p1x=(double*)calloc((size_t)Npoly*Nf,sizeof(double)))==0) {
   printf("%s: %d: no free memory\n",__FILE__,__LINE__);
   exit(1);
  }


  fact[0]=1.0;
  int ci,cj;
  for (ci=1; ci<Npoly; ci++) {
   fact[ci]=fact[ci-1]*(double)ci;
  }
  /* NOTE(review): assumes fmax>fmin, i.e. at least two distinct
     frequencies; otherwise invf divides by zero — confirm at callers */
  double invf=1.0/(fmax-fmin);
  double frat;
  for (ci=0; ci<Nf; ci++) {
   /* normalize coordinates */
   /* px row cj holds x^cj, p1x row cj holds (1-x)^cj, for x in [0,1] */
   frat=(freqs[ci]-fmin)*invf;
   px[ci]=1.0;
   p1x[ci]=1.0;
   px[ci+Nf]=frat;
   p1x[ci+Nf]=1.0-frat;
  }
  for (cj=2; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    px[cj*Nf+ci]=px[(cj-1)*Nf+ci]*px[Nf+ci];
    p1x[cj*Nf+ci]=p1x[(cj-1)*Nf+ci]*p1x[Nf+ci];
   }
  }
  for (cj=0; cj<Npoly; cj++) { /* ci: freq, cj: poly order */
   /* binomial coefficient C(Npoly-1, cj) via factorials */
   frat=fact[Npoly-1]/(fact[Npoly-cj-1]*fact[cj]);
   for (ci=0; ci<Nf; ci++) {
    B[ci*Npoly+cj]=frat*px[cj*Nf+ci]*p1x[(Npoly-cj-1)*Nf+ci];
   }
  }

#ifdef DEBUG
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]);
   }
   printf("\n");
  }
  printf("];\n");
#endif
  free(fact);
  free(px);
  free(p1x);
 } else if (type==3) { /* [1 (f-fo)/fo (fo/f-1) ... */
  double frat;
  double invf=1.0/freq0;
  int ci,cm;
  for (ci=0; ci<Nf; ci++) {
   B[ci*Npoly]=1.0;
   /* odd columns: powers of (f-f0)/f0 */
   frat=(freqs[ci]-freq0)*invf;
   double lastval=frat;
   for (cm=1; cm<Npoly; cm+=2) { /* odd values 1,3,5,... */
    B[ci*Npoly+cm]=lastval;
    lastval*=frat;
   }
   /* even columns: powers of (f0/f - 1) */
   frat=(freq0/freqs[ci]-1.0);
   lastval=frat;
   for (cm=2; cm<Npoly; cm+=2) { /* even values 2,4,6,... */
    B[ci*Npoly+cm]=lastval;
    lastval*=frat;
   }
  }
#ifdef DEBUG
  int cj;
  printf("BT=[\n");
  for(cj=0; cj<Npoly; cj++) {
   for (ci=0; ci<Nf; ci++) {
    printf("%lf ",B[ci*Npoly+cj]);
   }
   printf("\n");
  }
  printf("];\n");
#endif

 } else {
  /* unknown basis type: report but still return 0 (B left untouched) */
  fprintf(stderr,"%s : %d: undefined polynomial type\n",__FILE__,__LINE__);
 }
 return 0;
}
|
||||
|
||||
|
||||
|
||||
/* build matrix with polynomial terms
  B : Npoly x Nf, each row is one basis function
  Bi: Npoly x Npoly pseudo inverse of sum( B(:,col) x B(:,col)' )
  Npoly : total basis functions
  Nf: frequencies
  fratio: Nfx1 array of weighing factors depending on the flagged data of each freq
  Sum taken is a weighted sum, using weights in fratio

  Computes Bi = pinv( sum_ci fratio[ci] * B(:,ci) B(:,ci)^T ) via SVD:
  singular values below CLM_EPSILON are zeroed (Moore-Penrose truncation).
  Exits the process on allocation or LAPACK failure. Returns 0 on success.
*/
int
find_prod_inverse(double *B, double *Bi, int Npoly, int Nf, double *fratio) {

 int ci,status,lwork=0;
 double w[1],*WORK,*U,*S,*VT;
 /* set Bi to zero */
 memset(Bi,0,sizeof(double)*Npoly*Npoly);
 /* find sum */
 for (ci=0; ci<Nf; ci++) {
  /* outer product */
  /* rank-1 update: Bi += fratio[ci] * B(:,ci) B(:,ci)^T */
  my_dgemm('N','T',Npoly,Npoly,1,fratio[ci],&B[ci*Npoly],Npoly,&B[ci*Npoly],Npoly,1.0,Bi,Npoly);
 }
#ifdef DEBUG
 int cj;
 printf("BT=[\n");
 for (ci=0; ci<Nf; ci++) {
  for(cj=0; cj<Npoly; cj++) {
   printf("%lf ",B[ci*Npoly+cj]);
  }
  printf("\n");
 }
 printf("];\nBi=[\n");
 for (ci=0; ci<Npoly; ci++) {
  for(cj=0; cj<Npoly; cj++) {
   printf("%lf ",Bi[ci*Npoly+cj]);
  }
  printf("\n");
 }
 printf("];\n");
#endif

 if ((U=(double*)calloc((size_t)Npoly*Npoly,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((VT=(double*)calloc((size_t)Npoly*Npoly,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((S=(double*)calloc((size_t)Npoly,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }

 /* memory for SVD */
 /* LAPACK workspace query: lwork=-1 returns optimal size in w[0] */
 status=my_dgesvd('A','A',Npoly,Npoly,Bi,Npoly,S,U,Npoly,VT,Npoly,w,-1);
 if (!status) {
  lwork=(int)w[0];
 } else {
  printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
  exit(1);
 }
 if ((WORK=(double*)calloc((size_t)lwork,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }

 /* actual SVD: Bi = U diag(S) VT (Bi is overwritten by dgesvd) */
 status=my_dgesvd('A','A',Npoly,Npoly,Bi,Npoly,S,U,Npoly,VT,Npoly,WORK,lwork);
 if (status) {
  printf("%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
  exit(1);
 }

 /* find 1/singular values, and multiply columns of U with new singular values */
 for (ci=0; ci<Npoly; ci++) {
  if (S[ci]>CLM_EPSILON) {
   S[ci]=1.0/S[ci];
  } else {
   /* truncate near-zero singular values (pseudo-inverse) */
   S[ci]=0.0;
  }
  /* scale column ci of U by 1/S[ci] */
  my_dscal(Npoly,S[ci],&U[ci*Npoly]);
 }

 /* find product U 1/S V^T */
 /* valid pseudo-inverse here because the summed matrix is symmetric */
 my_dgemm('N','N',Npoly,Npoly,Npoly,1.0,U,Npoly,VT,Npoly,0.0,Bi,Npoly);

#ifdef DEBUG
 printf("Bii=[\n");
 for (ci=0; ci<Npoly; ci++) {
  for(cj=0; cj<Npoly; cj++) {
   printf("%lf ",Bi[ci*Npoly+cj]);
  }
  printf("\n");
 }
 printf("];\n");
#endif

 free(U);
 free(S);
 free(VT);
 free(WORK);
 return 0;
}
|
||||
|
||||
|
||||
|
||||
/* update Z
  Z: 8N Npoly x M double array (real and complex need to be updated separate)
  N : stations
  M : clusters
  Npoly: no of basis functions
  z : right hand side 8NM Npoly x 1 (note the different ordering from Z)
  Bi : NpolyxNpoly matrix, Bi^T=Bi assumed

  For each cluster, gathers the interleaved real/imag components of z
  into a contiguous 8N x Npoly matrix R, computes Q = R * Bi, and
  scatters Q back into Z with the interleaving restored.
  Returns 0; exits the process if workspace allocation fails.
*/
int
update_global_z(double *Z,int N,int M,int Npoly,double *z,double *Bi) {
 /* one block of Z for one direction 2Nx2xNpoly (complex)
   and 8NxNpoly real values : select one column : 2NxNpoly (complex)
   select real,imag : 2NxNpoly each (vector)
   reshape each to 2NxNpoly matrix => Q
   Bi : NpolyxNpoly matrix = B^T

   for each direction (M values)
   select 2N,2N,... : 2Nx Npoly complex values from z (ordered by M)
   select real,imag: size 2NxNpoly, 2NxNpoly vectors
   reshape to 2NxNpoly => R
   reshape to 2NxNpoly => I (imag)

   then Q=([R I] Bi^T) for each column
   Q=[R_1^T I_1^T R_2^T I_2^T]^T Bi^T for 2 columns
   R_1,I_1,R_2,I_2 : size 2NxNpoly
   R : (2N 4) x Npoly
   so find Q
 */
 double *R,*Q;
 /* workspace: 8N x Npoly each (2N rows x 4 interleaved components) */
 if ((R=(double*)calloc((size_t)2*N*Npoly*4,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((Q=(double*)calloc((size_t)2*N*Npoly*4,sizeof(double)))==0) {
  printf("%s: %d: no free memory\n",__FILE__,__LINE__);
  exit(1);
 }

 int ci,np;
 for (ci=0; ci<M; ci++) {
  for (np=0; np<Npoly; np++) {
   /* select 2N */
   /* stride-4 gathers de-interleave [R1 I1 R2 I2 ...] from z into
      contiguous columns of R; offsets 0..3 pick the component */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M], 4, &R[np*8*N], 1); /* R_1 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+1], 4, &R[np*8*N+2*N], 1); /* I_1 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+2], 4, &R[np*8*N+2*2*N], 1); /* R_2 */
   my_dcopy(2*N, &z[8*N*ci+np*8*N*M+3], 4, &R[np*8*N+3*2*N], 1); /* I_2 */
  }
  /* find Q=R B^T */
  /* Q zeroed each pass because dgemm is called with beta=1.0 */
  memset(Q,0,sizeof(double)*2*N*Npoly*4);
  my_dgemm('N','N',8*N,Npoly,Npoly,1.0,R,8*N,Bi,Npoly,1.0,Q,8*N);
  /* copy back to Z */
  /* stride-4 scatters restore the interleaved layout inside Z */
  for (np=0; np<Npoly; np++) {
   my_dcopy(2*N, &Q[np*8*N], 1, &Z[8*N*Npoly*ci+8*N*np], 4);
   my_dcopy(2*N, &Q[np*8*N+2*N], 1, &Z[8*N*Npoly*ci+8*N*np+1], 4);
   my_dcopy(2*N, &Q[np*8*N+2*2*N], 1, &Z[8*N*Npoly*ci+8*N*np+2], 4);
   my_dcopy(2*N, &Q[np*8*N+3*2*N], 1, &Z[8*N*Npoly*ci+8*N*np+3], 4);
  }

 }

 free(R);
 free(Q);
 return 0;
}
|
Binary file not shown.
|
@ -1,270 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "cuda.h"
|
||||
#include <cuComplex.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* enable this for checking for kernel failure */
|
||||
#define CUDA_DBG
|
||||
|
||||
/* Replace each of the M entries of x with 1/sqrt(x[i]) when x[i]>eps,
   otherwise with 0 (truncation of small values). One thread per entry. */
__global__ void
kernel_sqrtdiv_fl(int M, float eps, float *__restrict__ x){
 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
 /* make sure to use only M threads */
 if (tid<M) {
  const float v=x[tid];
  x[tid]=(v>eps ? 1.0f/sqrtf(v) : 0.0f);
 }
}
|
||||
|
||||
/* Scale each column of the MxM matrix U by the corresponding entry of
   the diagonal D (U <= U*diag(D)). One thread per matrix element;
   column index is tid/M because U is stored column major here. */
__global__ void
kernel_diagmult_fl(int M, float *__restrict__ U, const float *__restrict__ D) {

 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
 if (tid<M*M) {
  /* which column this tid operates on */
  U[tid] *= D[tid/M];
 }

}
|
||||
|
||||
|
||||
/* Compute d[i] = sum_j J[i,j]^2 (squared row norms of J).
   J is NxM in row-major order; one thread handles one row.
   Fix: the previous version accumulated directly into global memory
   (d[tid]=d[tid]+...) on every iteration, a redundant global
   read-modify-write per element; accumulate in a register instead
   and store once. */
__global__ void
kernel_jnorm_fl(int N, int M, const float *__restrict__ J, float *__restrict__ d) {
 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
 /* each thread handles one row */
 if (tid<N) {
  float acc=0.0f;
  for (int ci=0; ci<M; ci++) {
   /* J is transposed, so read each column */
   float v=J[tid*M+ci];
   acc+=v*v;
  }
  d[tid]=acc;
 }
}
|
||||
|
||||
/* Fill one 8-row block of the Jacobian of the residual w.r.t. one gain
   parameter: thread (n,m) computes d(model visibility of baseline n)/d(p[m]).
   Nbase: baselines, M: parameters (8 per station), jac: row-major NxM,
   coh: 8 floats (4 complex) per baseline, p: gain parameters,
   bb: 3 shorts per baseline (sta1, sta2, [flag — unused here]), N: stations. */
__global__ void
kernel_jacf_fl2(int Nbase, int M, float *__restrict__ jac, const float *__restrict__ coh, const float *__restrict__ p, const short *__restrict__ bb, int N){
 /* global thread index : equal to the baseline */
 unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
 /* which parameter:0...M */
 unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;

 if(n<Nbase && m<M) {
  int sta1=(int)bb[3*n];
  int sta2=(int)bb[3*n+1];
  /* condition for calculating this baseline sum is
   If this baseline is flagged,
   or if this parameter does not belong to sta1 or sta2
   we do not compute
  */
  int stc=m>>3; /* 0...Ns-1 (because M=total par= 8 * Nstations */
  /* flags are not taken into account */
  if (((stc==sta2)||(stc==sta1))) {

   /* 2x2 complex coherency matrix C of this baseline */
   cuFloatComplex C[4];
   C[0].x=coh[8*n];
   C[0].y=coh[8*n+1];
   C[1].x=coh[8*n+2];
   C[1].y=coh[8*n+3];
   C[2].x=coh[8*n+4];
   C[2].y=coh[8*n+5];
   C[3].x=coh[8*n+6];
   C[3].y=coh[8*n+7];

   /* which parameter exactly 0..7 */
   int stoff=m-stc*8;
   /* analytic derivative: replace the differentiated station's gain by
      the unit matrix e_stoff, keep the other station's gain as-is */
   float pp1[8];
   float pp2[8];
   if (stc==sta1) {
    for (int cn=0; cn<8; cn++) {
     pp1[cn]=0.0f;
     pp2[cn]=p[sta2*8+cn];
    }
    pp1[stoff]=1.0f;
   } else if (stc==sta2) {
    for (int cn=0; cn<8; cn++) {
     pp2[cn]=0.0f;
     pp1[cn]=p[sta1*8+cn];
    }
    pp2[stoff]=1.0f;
   }


   /* G1: 2x2 complex matrix from pp1 (row-major, [re,im] pairs) */
   cuFloatComplex G1[4];
   G1[0].x=pp1[0];
   G1[0].y=pp1[1];
   G1[1].x=pp1[2];
   G1[1].y=pp1[3];
   G1[2].x=pp1[4];
   G1[2].y=pp1[5];
   G1[3].x=pp1[6];
   G1[3].y=pp1[7];

   cuFloatComplex T1[4];
   /* T=G1*C */
   T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
   T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
   T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
   T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));

   cuFloatComplex G2[4];
   /* conjugate this */
   /* swapped off-diagonal indices + negated imaginary parts build the
      Hermitian (conjugate transpose) of station 2's gain matrix */
   G2[0].x=pp2[0];
   G2[0].y=-pp2[1];
   G2[2].x=pp2[2];
   G2[2].y=-pp2[3];
   G2[1].x=pp2[4];
   G2[1].y=-pp2[5];
   G2[3].x=pp2[6];
   G2[3].y=-pp2[7];

   /* T2 = T1 * G2 = G1 * C * G2^H */
   cuFloatComplex T2[4];
   T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
   T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
   T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
   T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
   /* update jacobian */
   /* NOTE: row major order */
   jac[m+M*8*n]=T2[0].x;
   jac[m+M*(8*n+1)]=T2[0].y;
   jac[m+M*(8*n+2)]=T2[1].x;
   jac[m+M*(8*n+3)]=T2[1].y;
   jac[m+M*(8*n+4)]=T2[2].x;
   jac[m+M*(8*n+5)]=T2[2].y;
   jac[m+M*(8*n+6)]=T2[3].x;
   jac[m+M*(8*n+7)]=T2[3].y;

  }
 }

}
|
||||
|
||||
|
||||
/* only use extern if calling code is C */
|
||||
extern "C"
|
||||
{
|
||||
|
||||
|
||||
/* cuda driver for calculating jacf() */
/* p: params (Mx1), jac: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
/* Launch wrapper for kernel_jacf_fl2: zeroes jac, launches a 2-D grid
   (x over baselines, y over parameters), synchronizes, and — when
   CUDA_DBG is defined — aborts the process on any kernel error.
   All pointers are device pointers. */
void
cudakernel_jacf_fl2(float *p, float *jac, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  /* NOTE: use small value for ThreadsPerBlock here, like 8 */
  dim3 threadsPerBlock(16, 8);
  /* jacobian: Nbase x Nstations (proportional to N), so */
  dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x,
               (M+threadsPerBlock.y-1)/threadsPerBlock.y);
  /* set memory of jac to zero */
  /* (kernel only writes rows whose parameter touches the baseline) */
  cudaMemset(jac, 0, N*M*sizeof(float));
 // printf("Kernel Jax data size=%d, params=%d, block=%d,%d, thread=%d,%d, baselines=%d\n",N, M, numBlocks.x,numBlocks.y, threadsPerBlock.x, threadsPerBlock.y, Nbase);
  kernel_jacf_fl2<<< numBlocks, threadsPerBlock>>>(Nbase, M, jac, coh, p, bbh, Nstations);

  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif

}
|
||||
|
||||
/* invert sqrt(singular values) 1/Sd[] for Sd[]> eps */
/* Launch wrapper for kernel_sqrtdiv_fl; synchronizes afterwards and,
   when CUDA_DBG is defined, aborts the process on any kernel error.
   Sd is a device pointer with M entries. */
void
cudakernel_sqrtdiv_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float eps, float *Sd) {

 kernel_sqrtdiv_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, eps, Sd);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
|
||||
/* U <= U D,
  U : MxM
  D : Mx1, diagonal matrix
*/
/* Launch wrapper for kernel_diagmult_fl; synchronizes afterwards and,
   when CUDA_DBG is defined, aborts the process on any kernel error.
   U and D are device pointers. */
void
cudakernel_diagmult_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float *U, float *D) {

 kernel_diagmult_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, U, D);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
|
||||
/* diag(J^T J)
  d[i] = J[i,:] * J[i,:]
  J: NxM (in row major order, so J[i,:] is actually J[:,i]
  d: Nx1
*/
/* Launch wrapper for kernel_jnorm_fl; synchronizes afterwards and,
   when CUDA_DBG is defined, aborts the process on any kernel error.
   J and d are device pointers. */
void
cudakernel_jnorm_fl(int ThreadsPerBlock, int BlocksPerGrid, float *J, int N, int M, float *d) {
 kernel_jnorm_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N,M,J,d);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 cudaError_t error = cudaGetLastError();
 if(error != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
}
|
Binary file not shown.
|
@ -1,550 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "Solvers.h"
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <pthread.h>
|
||||
#include <math.h>
|
||||
|
||||
|
||||
static void
|
||||
checkCudaError(cudaError_t err, const char *file, int line)
|
||||
{
|
||||
#ifdef CUDA_DEBUG
|
||||
if(!err)
|
||||
return;
|
||||
fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
|
||||
exit(EXIT_FAILURE);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void
|
||||
checkCublasError(cublasStatus_t cbstatus, char *file, int line)
|
||||
{
|
||||
#ifdef CUDA_DEBUG
|
||||
if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
|
||||
fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* find for one cluster J (J^T W J+ eW)^-1 J^T and extract diagonal as output
|
||||
p: parameters M x 1
|
||||
rd: residual vector N x 1 (on the device, invarient)
|
||||
x: (output) diagonal of leverage matrix
|
||||
|
||||
cbhandle,gWORK: BLAS/storage pointers
|
||||
|
||||
tileoff: need for hybrid parameters
|
||||
|
||||
adata: has all additional info: coherency,baselines,flags
|
||||
*/
|
||||
static int
|
||||
calculate_leverage(float *p, float *rd, float *x, int M, int N, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle, float *gWORK, int tileoff, int ntiles, me_data_t *dp) {
|
||||
|
||||
/* p needs to be copied to device and x needs to be copied back from device
|
||||
rd always remains in the device (select part with the right offset)
|
||||
N will change in hybrid mode, so copy back to x with right offset */
|
||||
|
||||
int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
|
||||
float *jacd,*xd,*jacTjacd,*pd,*cohd,*Ud,*VTd,*Sd;
|
||||
unsigned long int moff=0;
|
||||
short *bbd;
|
||||
|
||||
cudaError_t err;
|
||||
|
||||
/* total storage N+M*N+M*M+M+Nbase*8+M*M+M*M+M+M+Nbase*3(short)/(float) */
|
||||
xd=&gWORK[moff];
|
||||
moff+=N;
|
||||
jacd=&gWORK[moff];
|
||||
moff+=M*N;
|
||||
jacTjacd=&gWORK[moff];
|
||||
moff+=M*M;
|
||||
pd=&gWORK[moff];
|
||||
moff+=M;
|
||||
cohd=&gWORK[moff];
|
||||
moff+=Nbase*8;
|
||||
Ud=&gWORK[moff];
|
||||
moff+=M*M;
|
||||
VTd=&gWORK[moff];
|
||||
moff+=M*M;
|
||||
Sd=&gWORK[moff];
|
||||
moff+=M;
|
||||
|
||||
bbd=(short*)&gWORK[moff];
|
||||
moff+=(Nbase*3*sizeof(short))/sizeof(float);
|
||||
|
||||
err=cudaMemcpyAsync(pd, p, M*sizeof(float), cudaMemcpyHostToDevice,0);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* need to give right offset for coherencies */
|
||||
/* offset: cluster offset+time offset */
|
||||
err=cudaMemcpyAsync(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice,0);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* correct offset for baselines */
|
||||
err=cudaMemcpyAsync(bbd, &(dp->ddbase[3*(dp->Nbase)*(tileoff)]), Nbase*3*sizeof(short), cudaMemcpyHostToDevice,0);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
int ThreadsPerBlock=DEFAULT_TH_PER_BK;
|
||||
int ci,Mi;
|
||||
|
||||
/* extra storage for cusolver */
|
||||
int work_size=0;
|
||||
int *devInfo;
|
||||
err=cudaMalloc((void**)&devInfo, sizeof(int));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
float *work;
|
||||
float *rwork;
|
||||
cusolverDnSgesvd_bufferSize(solver_handle, M, M, &work_size);
|
||||
err=cudaMalloc((void**)&work, work_size*sizeof(float));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&rwork, 5*M*sizeof(float));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
|
||||
/* set mem to 0 */
|
||||
cudaMemset(xd, 0, N*sizeof(float));
|
||||
|
||||
/* calculate J^T, not taking flags into account */
|
||||
cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
|
||||
|
||||
/* calculate JTJ=(J^T J - [e] [W]) */
|
||||
//status=culaDeviceSgemm('N','T',M,M,N,1.0f,jacd,M,jacd,M,0.0f,jacTjacd,M);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
|
||||
float cone=1.0f; float czero=0.0f;
|
||||
cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_T,M,M,N,&cone,jacd,M,jacd,M,&czero,jacTjacd,M);
|
||||
|
||||
|
||||
/* add mu * I to JTJ */
|
||||
cudakernel_diagmu_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, jacTjacd, 1e-9f);
|
||||
|
||||
/* calculate inv(JTJ) using SVD */
|
||||
/* inv(JTJ) = Ud x Sid x VTd : we take into account that JTJ is symmetric */
|
||||
//status=culaDeviceSgesvd('A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
cusolverDnSgesvd(solver_handle,'A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M,work,work_size,rwork,devInfo);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
|
||||
/* find Sd= 1/sqrt(Sd) of the singular values (positive singular values) */
|
||||
cudakernel_sqrtdiv_fl(ThreadsPerBlock, (M+ThreadsPerBlock-1)/ThreadsPerBlock, M, 1e-9f, Sd);
|
||||
|
||||
/* multiply Ud with Sid (diagonal) Ud <= Ud Sid (columns modified) */
|
||||
cudakernel_diagmult_fl(ThreadsPerBlock, (M*M+ThreadsPerBlock-1)/ThreadsPerBlock, M, Ud, Sd);
|
||||
/* now multiply Ud VTd to get the square root */
|
||||
//status=culaDeviceSgemm('N','N',M,M,M,1.0f,Ud,M,VTd,M,0.0f,jacTjacd,M);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_N,M,M,M,&cone,Ud,M,VTd,M,&czero,jacTjacd,M);
|
||||
|
||||
/* calculate J^T, without taking flags into account (use same storage as previous J^T) */
|
||||
cudakernel_jacf_fl2(pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
|
||||
|
||||
/* multiply (J^T)^T sqrt(B) == sqrt(B)^T J^T, taking M columns at a time */
|
||||
for (ci=0; ci<(N+M-1)/M;ci++) {
|
||||
if (ci*M+M<N) {
|
||||
Mi=M;
|
||||
} else {
|
||||
Mi=N-ci*M;
|
||||
}
|
||||
//status=culaDeviceSgemm('T','N',M,Mi,M,1.0f,jacTjacd,M,&jacd[ci*M*M],M,0.0f,VTd,M);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
cbstatus=cublasSgemm(cbhandle,CUBLAS_OP_T,CUBLAS_OP_N,M,Mi,M,&cone,jacTjacd,M,&jacd[ci*M*M],M,&czero,VTd,M);
|
||||
|
||||
err=cudaMemcpy(&jacd[ci*M*M],VTd,Mi*M*sizeof(float),cudaMemcpyDeviceToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
}
|
||||
|
||||
/* xd[i] <= ||J[i,:]||^2 */
|
||||
cudakernel_jnorm_fl(ThreadsPerBlock, (N+ThreadsPerBlock-1)/ThreadsPerBlock, jacd, N, M, xd);
|
||||
|
||||
/* output x <=xd */
|
||||
err=cudaMemcpyAsync(x, xd, N*sizeof(float), cudaMemcpyDeviceToHost,0);
|
||||
cudaDeviceSynchronize();
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
checkCublasError(cbstatus,__FILE__,__LINE__);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/******************** pipeline functions **************************/
|
||||
typedef struct gb_data_dg_ {
|
||||
int status[2];
|
||||
float *p[2]; /* pointer to parameters being used by each thread (depends on cluster) */
|
||||
float *xo; /* residual vector (copied to device) */
|
||||
float *x[2]; /* output leverage values from each thread */
|
||||
int M[2]; /* no. of parameters (per cluster,hybrid) */
|
||||
int N[2]; /* no. of visibilities (might change in hybrid mode) */
|
||||
me_data_t *lmdata[2]; /* two for each thread */
|
||||
|
||||
/* GPU related info */
|
||||
cublasHandle_t cbhandle[2]; /* CUBLAS handles */
|
||||
cusolverDnHandle_t solver_handle[2];
|
||||
float *rd[2]; /* residual vector on the device (invarient) */
|
||||
float *gWORK[2]; /* GPU buffers */
|
||||
int64_t data_size; /* size of buffer (bytes) */
|
||||
|
||||
} gbdatadg;
|
||||
|
||||
|
||||
/* slave thread 2GPU function */
|
||||
static void *
|
||||
pipeline_slave_code_dg(void *data)
|
||||
{
|
||||
slave_tdata *td=(slave_tdata*)data;
|
||||
gbdatadg *gd=(gbdatadg*)(td->pline->data);
|
||||
int tid=td->tid;
|
||||
|
||||
while(1) {
|
||||
sync_barrier(&(td->pline->gate1)); /* stop at gate 1*/
|
||||
if(td->pline->terminate) break; /* if flag is set, break loop */
|
||||
sync_barrier(&(td->pline->gate2)); /* stop at gate 2 */
|
||||
/* do work */
|
||||
if (gd->status[tid]==PT_DO_CDERIV) {
|
||||
me_data_t *t=(me_data_t *)gd->lmdata[tid];
|
||||
/* divide the tiles into chunks tilesz/nchunk */
|
||||
int tilechunk=(t->tilesz+t->carr[t->clus].nchunk-1)/t->carr[t->clus].nchunk;
|
||||
|
||||
int ci;
|
||||
int cj=0;
|
||||
int ntiles;
|
||||
|
||||
/* loop over chunk, righ set of parameters and residual vector */
|
||||
for (ci=0; ci<t->carr[t->clus].nchunk; ci++) {
|
||||
/* divide the tiles into chunks tilesz/nchunk */
|
||||
if (cj+tilechunk<t->tilesz) {
|
||||
ntiles=tilechunk;
|
||||
} else {
|
||||
ntiles=t->tilesz-cj;
|
||||
}
|
||||
|
||||
/* right offset for rd[] and x[] needed and since no overlap,
|
||||
can wait for all chunks to complete */
|
||||
calculate_leverage(&gd->p[tid][ci*(gd->M[tid])],&gd->rd[tid][8*cj*t->Nbase],&gd->x[tid][8*cj*t->Nbase], gd->M[tid], 8*ntiles*t->Nbase, gd->cbhandle[tid], gd->solver_handle[tid], gd->gWORK[tid], cj, ntiles, gd->lmdata[tid]);
|
||||
|
||||
cj=cj+tilechunk;
|
||||
}
|
||||
|
||||
} else if (gd->status[tid]==PT_DO_AGPU) {
|
||||
attach_gpu_to_thread2(tid,&gd->cbhandle[tid],&gd->solver_handle[tid],&gd->gWORK[tid],gd->data_size,1);
|
||||
|
||||
/* copy residual vector to device */
|
||||
cudaError_t err;
|
||||
me_data_t *t=(me_data_t *)gd->lmdata[tid];
|
||||
err=cudaMalloc((void**)&gd->rd[tid], (size_t)8*t->tilesz*t->Nbase*sizeof(float));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
err=cudaMemcpy(gd->rd[tid], gd->xo, 8*t->tilesz*t->Nbase*sizeof(float), cudaMemcpyHostToDevice);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
} else if (gd->status[tid]==PT_DO_DGPU) {
|
||||
cudaFree(gd->rd[tid]);
|
||||
detach_gpu_from_thread2(gd->cbhandle[tid],gd->solver_handle[tid],gd->gWORK[tid],1);
|
||||
} else if (gd->status[tid]!=PT_DO_NOTHING) { /* catch error */
|
||||
fprintf(stderr,"%s: %d: invalid mode for slave tid=%d status=%d\n",__FILE__,__LINE__,tid,gd->status[tid]);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* initialize the pipeline
|
||||
and start the slaves rolling */
|
||||
static void
|
||||
init_pipeline_dg(th_pipeline *pline,
|
||||
void *data)
|
||||
{
|
||||
slave_tdata *t0,*t1;
|
||||
pthread_attr_init(&(pline->attr));
|
||||
pthread_attr_setdetachstate(&(pline->attr),PTHREAD_CREATE_JOINABLE);
|
||||
|
||||
init_th_barrier(&(pline->gate1),3); /* 3 threads, including master */
|
||||
init_th_barrier(&(pline->gate2),3); /* 3 threads, including master */
|
||||
pline->terminate=0;
|
||||
pline->data=data; /* data should have pointers to t1 and t2 */
|
||||
|
||||
if ((t0=(slave_tdata*)malloc(sizeof(slave_tdata)))==0) {
|
||||
fprintf(stderr,"no free memory\n");
|
||||
exit(1);
|
||||
}
|
||||
if ((t1=(slave_tdata*)malloc(sizeof(slave_tdata)))==0) {
|
||||
fprintf(stderr,"no free memory\n");
|
||||
exit(1);
|
||||
}
|
||||
if ((pline->thst=(taskhist*)malloc(sizeof(taskhist)))==0) {
|
||||
fprintf(stderr,"no free memory\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
init_task_hist(pline->thst);
|
||||
t0->pline=t1->pline=pline;
|
||||
t0->tid=0;
|
||||
t1->tid=1; /* link back t1, t2 to data so they could be freed */
|
||||
pline->sd0=t0;
|
||||
pline->sd1=t1;
|
||||
pthread_create(&(pline->slave0),&(pline->attr),pipeline_slave_code_dg,(void*)t0);
|
||||
pthread_create(&(pline->slave1),&(pline->attr),pipeline_slave_code_dg,(void*)t1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* destroy the pipeline */
|
||||
/* need to kill the slaves first */
|
||||
static void
|
||||
destroy_pipeline_dg(th_pipeline *pline)
|
||||
{
|
||||
|
||||
pline->terminate=1;
|
||||
sync_barrier(&(pline->gate1));
|
||||
pthread_join(pline->slave0,NULL);
|
||||
pthread_join(pline->slave1,NULL);
|
||||
destroy_th_barrier(&(pline->gate1));
|
||||
destroy_th_barrier(&(pline->gate2));
|
||||
pthread_attr_destroy(&(pline->attr));
|
||||
destroy_task_hist(pline->thst);
|
||||
free(pline->thst);
|
||||
free(pline->sd0);
|
||||
free(pline->sd1);
|
||||
pline->data=NULL;
|
||||
}
|
||||
/******************** end pipeline functions **************************/
|
||||
|
||||
|
||||
|
||||
/* Calculate St.Laurent-Cook Jacobian leverage
|
||||
xo: residual (modified)
|
||||
flags: 2 for flags based on uvcut, 1 for normal flags
|
||||
coh: coherencies are calculated for all baselines, regardless of flag
|
||||
diagmode: 1: replace residual, 2: calc noise/leverage ratio
|
||||
*/
|
||||
int
|
||||
calculate_diagnostics(double *u,double *v,double *w,double *p,double *xo,int N,int Nbase,int tilesz,baseline_t *barr, clus_source_t *carr, complex double *coh, int M,int Mt,int diagmode, int Nt) {
|
||||
|
||||
|
||||
int cj;
|
||||
int n;
|
||||
me_data_t lmdata0,lmdata1;
|
||||
int Nbase1;
|
||||
|
||||
/* no of data */
|
||||
n=Nbase*tilesz*8;
|
||||
|
||||
/* true no of baselines */
|
||||
Nbase1=Nbase*tilesz;
|
||||
|
||||
double *ddcoh;
|
||||
short *ddbase;
|
||||
|
||||
int c0,c1;
|
||||
|
||||
float *ddcohf, *pf, *xdummy0f, *xdummy1f, *res0, *dgf;
|
||||
/********* thread data ******************/
|
||||
/* barrier */
|
||||
th_pipeline tp;
|
||||
gbdatadg tpg;
|
||||
/****************************************/
|
||||
|
||||
lmdata0.clus=lmdata1.clus=-1;
|
||||
/* setup data for lmfit */
|
||||
lmdata0.u=lmdata1.u=u;
|
||||
lmdata0.v=lmdata1.v=v;
|
||||
lmdata0.w=lmdata1.w=w;
|
||||
lmdata0.Nbase=lmdata1.Nbase=Nbase;
|
||||
lmdata0.tilesz=lmdata1.tilesz=tilesz;
|
||||
lmdata0.N=lmdata1.N=N;
|
||||
lmdata0.barr=lmdata1.barr=barr;
|
||||
lmdata0.carr=lmdata1.carr=carr;
|
||||
lmdata0.M=lmdata1.M=M;
|
||||
lmdata0.Mt=lmdata1.Mt=Mt;
|
||||
lmdata0.freq0=lmdata1.freq0=NULL; /* not used */
|
||||
lmdata0.Nt=lmdata1.Nt=Nt;
|
||||
lmdata0.coh=lmdata1.coh=coh;
|
||||
/* rearrange coh for GPU use */
|
||||
if ((ddcoh=(double*)calloc((size_t)(M*Nbase1*8),sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((ddcohf=(float*)calloc((size_t)(M*Nbase1*8),sizeof(float)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((ddbase=(short*)calloc((size_t)(Nbase1*3),sizeof(short)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
rearrange_coherencies2(Nbase1, barr, coh, ddcoh, ddbase, M, Nt);
|
||||
lmdata0.ddcoh=lmdata1.ddcoh=ddcoh;
|
||||
lmdata0.ddbase=lmdata1.ddbase=ddbase;
|
||||
/* ddcohf (float) << ddcoh (double) */
|
||||
double_to_float(ddcohf,ddcoh,M*Nbase1*8,Nt);
|
||||
lmdata0.ddcohf=lmdata1.ddcohf=ddcohf;
|
||||
|
||||
if ((pf=(float*)calloc((size_t)(Mt*8*N),sizeof(float)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
double_to_float(pf,p,Mt*8*N,Nt);
|
||||
/* residual */
|
||||
if ((res0=(float*)calloc((size_t)(n),sizeof(float)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
double_to_float(res0,xo,n,Nt);
|
||||
|
||||
/* sum of diagonal values of leverage */
|
||||
if ((dgf=(float*)calloc((size_t)(n),sizeof(float)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((xdummy0f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((xdummy1f=(float*)calloc((size_t)(n),sizeof(float)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
/********** setup threads *******************************/
|
||||
/* also calculate the total storage needed to be allocated on a GPU */
|
||||
/* determine total size for memory allocation
|
||||
residual = n (separately allocated)
|
||||
diagonal = n
|
||||
For one cluster,
|
||||
Jacobian = nxm, J^T J = mxm, (also inverse)
|
||||
*/
|
||||
int Mm=8*N; /* no of parameters */
|
||||
int64_t data_sz=0;
|
||||
data_sz=(int64_t)(n+Mm*n+3*Mm*Mm+3*Mm+Nbase1*8)*sizeof(float)+(int64_t)Nbase1*3*sizeof(short);
|
||||
tpg.data_size=data_sz;
|
||||
tpg.lmdata[0]=&lmdata0;
|
||||
tpg.lmdata[1]=&lmdata1;
|
||||
tpg.xo=res0; /* residual */
|
||||
|
||||
init_pipeline_dg(&tp,&tpg);
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_AGPU;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2*/
|
||||
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2*/
|
||||
|
||||
/********** done setup threads *******************************/
|
||||
tpg.x[0]=xdummy0f;
|
||||
tpg.M[0]=8*N; /* even though size of p is > M, dont change this */
|
||||
tpg.N[0]=n; /* Nbase*tilesz*8 */
|
||||
tpg.x[1]=xdummy1f;
|
||||
tpg.M[1]=8*N; /* even though size of p is > M, dont change this */
|
||||
tpg.N[1]=n; /* Nbase*tilesz*8 */
|
||||
|
||||
for (cj=0; cj<M/2; cj++) { /* iter per cluster pairs */
|
||||
c0=2*cj;
|
||||
c1=2*cj+1;
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1 */
|
||||
lmdata0.clus=c0;
|
||||
lmdata1.clus=c1;
|
||||
|
||||
/* run this from a separate thread */
|
||||
tpg.p[0]=&pf[carr[c0].p[0]]; /* length carr[c0].nchunk times */
|
||||
tpg.p[1]=&pf[carr[c1].p[0]]; /* length carr[c1].nchunk times */
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_CDERIV;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2 */
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1 */
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2 */
|
||||
/* add result to the sum */
|
||||
my_saxpy(n, xdummy0f, 1.0f, dgf);
|
||||
my_saxpy(n, xdummy1f, 1.0f, dgf);
|
||||
}
|
||||
/* odd cluster out, if M is odd */
|
||||
if (M%2) {
|
||||
c0=M-1;
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1 */
|
||||
tpg.p[0]=&pf[carr[c0].p[0]];
|
||||
lmdata0.clus=c0;
|
||||
|
||||
tpg.status[0]=PT_DO_CDERIV;
|
||||
tpg.status[1]=PT_DO_NOTHING;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2 */
|
||||
/**************************************************************************/
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1 */
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_NOTHING;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2 */
|
||||
my_saxpy(n, xdummy0f, 1.0f, dgf);
|
||||
}
|
||||
free(pf);
|
||||
free(ddcohf);
|
||||
free(xdummy1f);
|
||||
free(res0);
|
||||
free(ddcoh);
|
||||
|
||||
/******** free threads ***************/
|
||||
sync_barrier(&(tp.gate1)); /* sync at gate 1*/
|
||||
tpg.status[0]=tpg.status[1]=PT_DO_DGPU;
|
||||
sync_barrier(&(tp.gate2)); /* sync at gate 2*/
|
||||
destroy_pipeline_dg(&tp);
|
||||
/******** done free threads ***************/
|
||||
|
||||
/* now add 1's to locations with flagged data */
|
||||
/* create array for adding */
|
||||
create_onezerovec(Nbase1, ddbase, xdummy0f, Nt);
|
||||
my_saxpy(n, xdummy0f, 1.0f, dgf);
|
||||
free(xdummy0f);
|
||||
free(ddbase);
|
||||
/* output */
|
||||
// for (cj=0; cj<n; cj++) {
|
||||
// printf("%d %f\n",cj,dgf[cj]);
|
||||
// }
|
||||
if (diagmode==1) {
|
||||
/* copy back to output */
|
||||
float_to_double(xo,dgf,n,Nt);
|
||||
} else {
|
||||
/* solve system of equations a * leverage + b * 1 = |residual|
|
||||
to find a,b scalars, and just print them as output */
|
||||
/* find 1^T |r| = sum (|residual|) and lev^T |r| */
|
||||
float sum1,sum2;
|
||||
find_sumproduct(n, res0, dgf, &sum1, &sum2, Nt);
|
||||
//printf("sum|res|=%f sum(lev^T |res|)=%f\n",sum1,sum2);
|
||||
float a00,a01,a11;
|
||||
a00=my_fnrm2(n,dgf); /* lev^T lev */
|
||||
a01=my_fasum(n,dgf); /* = a10 = sum|leverage| */
|
||||
a00=a00*a00;
|
||||
a11=(float)n; /* sum( 1 ) */
|
||||
float r00,r01;
|
||||
r00=sum1;
|
||||
r01=sum2;
|
||||
//printf("A=[\n %f %f;\n %f %f];\n b=[\n %f\n %f\n]\n",a00,a01,a01,a11,r00,r01);
|
||||
/* solve A [a b]^T = r */
|
||||
float alpha,beta,denom;
|
||||
denom=(a00*a11-a01*a01);
|
||||
//printf("denom=%f\n",denom);
|
||||
if (denom>1e-6f) { /* can be solved */
|
||||
alpha=(r00*a11-r01*a01)/denom;
|
||||
} else {
|
||||
alpha=0.0f;
|
||||
}
|
||||
beta=(r00-a00*alpha)/a01;
|
||||
printf("Error Noise/Model %e/%e\n",beta,alpha);
|
||||
}
|
||||
free(dgf);
|
||||
return 0;
|
||||
}
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -1,926 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
|
||||
#include "Solvers.h"
|
||||
#include <pthread.h>
|
||||
|
||||
|
||||
/**** repeated code here ********************/
|
||||
/* Jones matrix multiplication
|
||||
C=A*B
|
||||
*/
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static void
|
||||
amb(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
|
||||
c[0]=a[0]*b[0]+a[1]*b[2];
|
||||
c[1]=a[0]*b[1]+a[1]*b[3];
|
||||
c[2]=a[2]*b[0]+a[3]*b[2];
|
||||
c[3]=a[2]*b[1]+a[3]*b[3];
|
||||
}
|
||||
|
||||
/* Jones matrix multiplication
|
||||
C=A*B^H
|
||||
*/
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static void
|
||||
ambt(complex double * __restrict a, complex double * __restrict b, complex double * __restrict c) {
|
||||
c[0]=a[0]*conj(b[0])+a[1]*conj(b[1]);
|
||||
c[1]=a[0]*conj(b[2])+a[1]*conj(b[3]);
|
||||
c[2]=a[2]*conj(b[0])+a[3]*conj(b[1]);
|
||||
c[3]=a[2]*conj(b[2])+a[3]*conj(b[3]);
|
||||
}
|
||||
|
||||
/**** end repeated code ********************/
|
||||
|
||||
/* worker thread for a cpu */
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static void *
|
||||
cpu_calc_deriv(void *adata) {
|
||||
thread_data_grad_t *t=(thread_data_grad_t*)adata;
|
||||
|
||||
int ci,nb;
|
||||
int stc,stoff,stm,sta1,sta2;
|
||||
int N=t->N; /* stations */
|
||||
int M=t->M; /* clusters */
|
||||
int Nbase=(t->Nbase)*(t->tilesz);
|
||||
|
||||
|
||||
complex double xr[4]; /* residuals */
|
||||
complex double G1[4],G2[4],C[4],T1[4],T2[4];
|
||||
double pp[8];
|
||||
complex double csum;
|
||||
int cli,tpchunk,pstart,nchunk,tilesperchunk,stci,ttile,tptile,poff;
|
||||
|
||||
/* iterate over each paramter */
|
||||
for (ci=t->g_start; ci<=t->g_end; ++ci) {
|
||||
t->g[ci]=0.0;
|
||||
/* find station and parameter corresponding to this value of ci */
|
||||
/* this parameter should correspond to the right baseline (x tilesz)
|
||||
to contribute to the derivative */
|
||||
cli=0;
|
||||
while((cli<M) && (ci<t->carr[cli].p[0] || ci>t->carr[cli].p[0]+8*N*t->carr[cli].nchunk-1)) {
|
||||
cli++;
|
||||
}
|
||||
/* now either cli>=M: cluster not found
|
||||
or cli<M and cli is the right cluster */
|
||||
if (cli==M && ci>=t->carr[cli-1].p[0] && ci<=t->carr[cli-1].p[0]+8*N*t->carr[cli-1].nchunk-1) {
|
||||
cli--;
|
||||
}
|
||||
|
||||
if (cli<M) {
|
||||
/* right parameter offset */
|
||||
stci=ci-t->carr[cli].p[0];
|
||||
|
||||
stc=(stci%(8*N))/8; /* 0..N-1 */
|
||||
/* make sure this baseline contribute to this parameter */
|
||||
tpchunk=stci/(8*N);
|
||||
nchunk=t->carr[cli].nchunk;
|
||||
pstart=t->carr[cli].p[0];
|
||||
tilesperchunk=(t->tilesz+nchunk-1)/nchunk;
|
||||
|
||||
|
||||
/* iterate over all baselines and accumulate sum */
|
||||
for (nb=0; nb<Nbase; ++nb) {
|
||||
/* which tile is this ? */
|
||||
ttile=nb/t->Nbase;
|
||||
/* which chunk this tile belongs to */
|
||||
tptile=ttile/tilesperchunk;
|
||||
/* now tptile has to match tpchunk, otherwise ignore calculation */
|
||||
if (tptile==tpchunk) {
|
||||
|
||||
sta1=t->barr[nb].sta1;
|
||||
sta2=t->barr[nb].sta2;
|
||||
|
||||
if (((stc==sta1)||(stc==sta2))&& !t->barr[nb].flag) {
|
||||
/* this baseline has a contribution */
|
||||
/* which paramter of this station */
|
||||
stoff=(stci%(8*N))%8; /* 0..7 */
|
||||
/* which cluster */
|
||||
stm=cli; /* 0..M-1 */
|
||||
|
||||
/* exact expression for derivative
|
||||
2 real( vec^H(residual_this_baseline)
|
||||
* vec(-J_{pm}C_{pqm} J_{qm}^H)
|
||||
where m: chosen cluster
|
||||
J_{pm},J_{qm} Jones matrices for baseline p-q
|
||||
depending on the parameter, J ==> E
|
||||
E: zero matrix, except 1 at location of m
|
||||
|
||||
residual : in x[8*nb:8*nb+7]
|
||||
C coh: in coh[8*M*nb+m*8:8*M*nb+m*8+7] (double storage)
|
||||
coh[4*M*nb+4*m:4*M*nb+4*m+3] (complex storage)
|
||||
J_p,J_q: in p[sta1*8+m*8*N: sta1*8+m*8*N+7]
|
||||
and p[sta2*8+m*8*N: sta2*8+m*8*N+ 7]
|
||||
*/
|
||||
/* read in residual vector, conjugated */
|
||||
xr[0]=(t->x[nb*8])-_Complex_I*(t->x[nb*8+1]);
|
||||
xr[1]=(t->x[nb*8+2])-_Complex_I*(t->x[nb*8+3]);
|
||||
xr[2]=(t->x[nb*8+4])-_Complex_I*(t->x[nb*8+5]);
|
||||
xr[3]=(t->x[nb*8+6])-_Complex_I*(t->x[nb*8+7]);
|
||||
|
||||
/* read in coherency */
|
||||
C[0]=t->coh[4*M*nb+4*stm];
|
||||
C[1]=t->coh[4*M*nb+4*stm+1];
|
||||
C[2]=t->coh[4*M*nb+4*stm+2];
|
||||
C[3]=t->coh[4*M*nb+4*stm+3];
|
||||
|
||||
memset(pp,0,sizeof(double)*8);
|
||||
if (stc==sta1) {
|
||||
/* this station parameter gradient */
|
||||
pp[stoff]=1.0;
|
||||
memset(G1,0,sizeof(complex double)*4);
|
||||
G1[0]=pp[0]+_Complex_I*pp[1];
|
||||
G1[1]=pp[2]+_Complex_I*pp[3];
|
||||
G1[2]=pp[4]+_Complex_I*pp[5];
|
||||
G1[3]=pp[6]+_Complex_I*pp[7];
|
||||
poff=pstart+tpchunk*8*N+sta2*8;
|
||||
G2[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
|
||||
G2[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
|
||||
G2[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+4]);
|
||||
G2[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
|
||||
|
||||
} else if (stc==sta2) {
|
||||
memset(G2,0,sizeof(complex double)*4);
|
||||
pp[stoff]=1.0;
|
||||
G2[0]=pp[0]+_Complex_I*pp[1];
|
||||
G2[1]=pp[2]+_Complex_I*pp[3];
|
||||
G2[2]=pp[4]+_Complex_I*pp[5];
|
||||
G2[3]=pp[6]+_Complex_I*pp[7];
|
||||
poff=pstart+tpchunk*8*N+sta1*8;
|
||||
G1[0]=(t->p[poff])+_Complex_I*(t->p[poff+1]);
|
||||
G1[1]=(t->p[poff+2])+_Complex_I*(t->p[poff+3]);
|
||||
G1[2]=(t->p[poff+4])+_Complex_I*(t->p[poff+5]);
|
||||
G1[3]=(t->p[poff+6])+_Complex_I*(t->p[poff+7]);
|
||||
}
|
||||
|
||||
/* T1=G1*C */
|
||||
amb(G1,C,T1);
|
||||
/* T2=T1*G2' */
|
||||
ambt(T1,G2,T2);
|
||||
|
||||
/* calculate product xr*vec(J_p C J_q^H ) */
|
||||
csum=xr[0]*T2[0];
|
||||
csum+=xr[1]*T2[1];
|
||||
csum+=xr[2]*T2[2];
|
||||
csum+=xr[3]*T2[3];
|
||||
|
||||
/* accumulate sum */
|
||||
t->g[ci]+=-2.0*creal(csum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static int
|
||||
func_grad(
|
||||
void (*func)(double *p, double *hx, int m, int n, void *adata),
|
||||
double *p, double *g, double *xo, int m, int n, double step, void *adata) {
|
||||
/* gradient for each parameter is
|
||||
(||func(p+step*e_i)-x||^2-||func(p-step*e_i)-x||^2)/2*step
|
||||
i=0,...,m-1 for all parameters
|
||||
e_i: unit vector, 1 only at i-th location
|
||||
*/
|
||||
|
||||
double *x; /* array to store residual */
|
||||
int ci;
|
||||
me_data_t *dp=(me_data_t*)adata;
|
||||
|
||||
int Nt=dp->Nt;
|
||||
|
||||
pthread_attr_t attr;
|
||||
pthread_t *th_array;
|
||||
thread_data_grad_t *threaddata;
|
||||
|
||||
|
||||
if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
/* evaluate func once, store in x, and create threads */
|
||||
/* and calculate the residual x=xo-func */
|
||||
func(p,x,m,n,adata);
|
||||
/* calculate x<=x-xo */
|
||||
my_daxpy(n,xo,-1.0,x);
|
||||
|
||||
/* setup threads */
|
||||
pthread_attr_init(&attr);
|
||||
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
|
||||
|
||||
if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if ((threaddata=(thread_data_grad_t*)malloc((size_t)Nt*sizeof(thread_data_grad_t)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int nth,nth1,Nparm;
|
||||
|
||||
/* parameters per thread */
|
||||
Nparm=(m+Nt-1)/Nt;
|
||||
|
||||
/* each thread will calculate derivative of part of
|
||||
parameters */
|
||||
ci=0;
|
||||
for (nth=0; nth<Nt; nth++) {
|
||||
threaddata[nth].Nbase=dp->Nbase;
|
||||
threaddata[nth].tilesz=dp->tilesz;
|
||||
threaddata[nth].barr=dp->barr;
|
||||
threaddata[nth].carr=dp->carr;
|
||||
threaddata[nth].M=dp->M;
|
||||
threaddata[nth].N=dp->N;
|
||||
threaddata[nth].coh=dp->coh;
|
||||
threaddata[nth].m=m;
|
||||
threaddata[nth].n=n;
|
||||
threaddata[nth].x=x;
|
||||
threaddata[nth].p=p;
|
||||
threaddata[nth].g=g;
|
||||
threaddata[nth].g_start=ci;
|
||||
threaddata[nth].g_end=ci+Nparm-1;
|
||||
if (threaddata[nth].g_end>=m) {
|
||||
threaddata[nth].g_end=m-1;
|
||||
}
|
||||
ci=ci+Nparm;
|
||||
pthread_create(&th_array[nth],&attr,cpu_calc_deriv,(void*)(&threaddata[nth]));
|
||||
}
|
||||
|
||||
/* now wait for threads to finish */
|
||||
for(nth1=0; nth1<nth; nth1++) {
|
||||
pthread_join(th_array[nth1],NULL);
|
||||
}
|
||||
|
||||
pthread_attr_destroy(&attr);
|
||||
|
||||
free(th_array);
|
||||
free(threaddata);
|
||||
|
||||
free(x);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* use algorithm 9.1 (LBFGS two-loop recursion) to compute pk=Hk gk */
/* pk,gk: size m x 1
   s, y: size mM x 1 (M past steps of delta(param) and delta(grad))
   rho: size M x 1 (1/y^T s for each stored pair)
   M: no. of stored pairs actually valid (can be 0 on first iteration)
   ii: true location of the k th values in s,y (circular buffer head) */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static void
mult_hessian(int m, double *pk, double *gk, double *s, double *y, double *rho, int M, int ii) {
 int ci;
 double *alphai;
 int *idx; /* store sorted locations of s, y here */
 double gamma,beta;

 /* NOTE: allocate at least 1 element: calloc(0,..) may legally return NULL,
    which would be misreported as out-of-memory when M==0 (first iteration) */
 if ((alphai=(double*)calloc((size_t)(M>0?M:1),sizeof(double)))==0) {
#ifndef USE_MIC
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
  exit(1);
 }
 /* FIX: element type is int, so allocate sizeof(int), not sizeof(double) */
 if ((idx=(int*)calloc((size_t)(M>0?M:1),sizeof(int)))==0) {
#ifndef USE_MIC
  fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
  exit(1);
 }
 if (M>0) {
  /* find the location of k-1 th value */
  if (ii>0) {
   ii=ii-1;
  } else {
   ii=M-1;
  }
  /* s,y will have 0,1,...,ii,ii+1,...M-1 */
  /* map this to ii+1,ii+2,...,M-1,0,1,..,ii : oldest pair first */
  for (ci=0; ci<M-ii-1; ci++){
   idx[ci]=(ii+ci+1);
  }
  for(ci=M-ii-1; ci<M; ci++) {
   idx[ci]=(ci-M+ii+1);
  }
 }

#ifdef DEBUG
 printf("prod M=%d, current ii=%d\n",M,ii);
 for(ci=0; ci<M; ci++) {
  printf("%d->%d ",ci,idx[ci]);
 }
 printf("\n");
#endif
 /* q = grad(f)k : pk<=gk */
 my_dcopy(m,gk,1,pk,1);
 /* first loop: newest pair to oldest (the order is significant) */
 for (ci=0; ci<M; ci++) {
  /* alphai=rhoi si^T*q */
  alphai[M-ci-1]=rho[idx[M-ci-1]]*my_ddot(m,&s[m*idx[M-ci-1]],pk);
  /* q=q-alphai yi */
  my_daxpy(m,&y[m*idx[M-ci-1]],-alphai[M-ci-1],pk);
 }
 /* r=Hk(0) q : initial hessian */
 /* gamma=s(k-1)^T*y(k-1)/y(k-1)^T*y(k-1)*/
 gamma=1.0;
 if (M>0) {
  gamma=my_ddot(m,&s[m*idx[M-1]],&y[m*idx[M-1]]);
  gamma/=my_ddot(m,&y[m*idx[M-1]],&y[m*idx[M-1]]);
  /* Hk(0)=gamma I, so scale q by gamma */
  /* r= Hk(0) q */
  my_dscal(m,gamma,pk);
 }

 /* second loop: oldest pair to newest */
 for (ci=0; ci<M; ci++) {
  /* beta=rhoi yi^T * r */
  beta=rho[idx[ci]]*my_ddot(m,&y[m*idx[ci]],pk);
  /* r = r + (alphai-beta)*si */
  my_daxpy(m,&s[m*idx[ci]],alphai[ci]-beta,pk);
 }

 free(alphai);
 free(idx);
}
|
||||
|
||||
/* cubic interpolation in interval [a,b] (a>b is possible)
   to find step that minimizes cost function */
/* func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   a/b: interval for interpolation
   x: size n x 1 (storage)
   xp: size m x 1 (storage)
   xo: observed data size n x 1
   n: size of vector function
   step: step size for (central) finite differencing of the gradient
   adata: additional data passed to the function
   returns: the step value (a, b, or the cubic-minimizer root) with the
   lowest cost phi(alpha)=||func(xk+alpha pk)-xo||^2 */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static double
cubic_interp(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double a, double b, double *x, double *xp, double *xo, int m, int n, double step, void *adata) {

  double f0,f1,f0d,f1d; /* function values and derivatives at a,b */
  double p01,p02,z0,fz0;
  double aa,cc;

  /* phi(a) */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,a,xp); /* xp<=xp+(a)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  f0=my_dnrm2(n,x);
  f0*=f0;
  /* grad(phi_0): central difference, evaluate at a-step and a+step */
  my_daxpy(m,pk,step,xp); /* xp<=xp+(a+step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(a-step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  f0d=(p01*p01-p02*p02)/(2.0*step);

  /* phi(b) */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,b,xp); /* xp<=xp+(b)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  f1=my_dnrm2(n,x);
  f1*=f1;
  /* grad(phi_1): central difference, evaluate at b-step and b+step */
  my_daxpy(m,pk,step,xp); /* xp<=xp+(b+step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(b-step)*pk */
  func(xp,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  f1d=(p01*p01-p02*p02)/(2.0*step);


  //printf("Interp a,f(a),f'(a): (%lf,%lf,%lf) (%lf,%lf,%lf)\n",a,f0,f0d,b,f1,f1d);
  /* cubic poly in [0,1] is f0+f0d z+eta z^2+xi z^3
    where eta=3(f1-f0)-2f0d-f1d, xi=f0d+f1d-2(f1-f0)
    derivative f0d+2 eta z+3 xi z^2 => cc+bb z+aa z^2 */
  aa=3.0*(f0-f1)/(b-a)+(f1d-f0d);
  p01=aa*aa-f0d*f1d;
  /* root exist? */
  if (p01>0.0) {
   /* root */
   cc=sqrt(p01);
   z0=b-(f1d+cc-aa)*(b-a)/(f1d-f0d+2.0*cc);
   /* FIXME: check if this is within boundary */
   aa=MAX(a,b);
   cc=MIN(a,b);
   //printf("Root=%lf, in [%lf,%lf]\n",z0,cc,aa);
   if (z0>aa || z0<cc) {
    /* root outside [min(a,b),max(a,b)]: poison fz0 so a or b is chosen */
    fz0=f0+f1;
   } else {
    /* evaluate function for this root */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    /* NOTE(review): z0 is rejected above as a value in [a,b], but here it is
       used as if it were a fraction of (b-a) — looks inconsistent; verify
       against the derivation before changing */
    my_daxpy(m,pk,a+z0*(b-a),xp); /* xp<=xp+(z0)*pk */
    func(xp,x,m,n,adata);
    my_daxpy(n,xo,-1.0,x);
    fz0=my_dnrm2(n,x);
    fz0*=fz0;
   }

   /* now choose between f0,f1,fz0 */
   if (f0<f1 && f0<fz0) {
    return a;
   }
   if (f1<fz0) {
    return b;
   }
   /* else */
   return (z0);
  } else {
   /* no root of the derivative: find the value from a or b that minimizes func */
   if (f0<f1) {
    return a;
   } else {
    return b;
   }
  }
  /* (unreachable `return 0;` removed: both branches above return) */
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*************** Fletcher line search **********************************/
/* zoom function for line search: shrinks the bracket [a,b] until a step
   satisfying the (strong Wolfe style) conditions is found, or the
   iteration budget (10) runs out */
/* func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   a/b: bracket interval [a,b] (a>b) is possible
   x: size n x 1 (storage)
   xp: size m x 1 (storage)
   phi_0: phi(0)
   gphi_0: grad(phi(0))
   sigma,rho,t1,t2,t3: line search parameters (from Fletcher)
   xo: observed data size n x 1
   n: size of vector function
   step: step size for differencing
   adata: additional data passed to the function
   where phi(alpha)=||func(xk+alpha pk)-xo||^2 */
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static double
linesearch_zoom(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double a, double b, double *x, double *xp, double phi_0, double gphi_0, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {

  double alphaj,phi_j,phi_aj;
  double gphi_j,p01,p02,aj,bj;
  double alphak=1.0;
  int ci,found_step=0;

  aj=a;
  bj=b;
  ci=0;
  /* at most 10 bracket-shrinking iterations */
  while(ci<10) {
    /* choose alphaj from [a+t2(b-a),b-t3(b-a)] */
    p01=aj+t2*(bj-aj);
    p02=bj-t3*(bj-aj);
    alphaj=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
    //printf("cubic intep [%lf,%lf]->%lf\n",p01,p02,alphaj);

    /* evaluate phi(alphaj) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,alphaj,xp); /* xp<=xp+(alphaj)*pk */
    func(xp,x,m,n,adata);
    /* calculate x<=x-xo */
    my_daxpy(n,xo,-1.0,x);
    phi_j=my_dnrm2(n,x);
    phi_j*=phi_j;

    /* evaluate phi(aj) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,aj,xp); /* xp<=xp+(aj)*pk */
    func(xp,x,m,n,adata);
    /* calculate x<=x-xo */
    my_daxpy(n,xo,-1.0,x);
    phi_aj=my_dnrm2(n,x);
    phi_aj*=phi_aj;


    /* sufficient-decrease failed, or no improvement over phi(aj):
       shrink bracket from the b side */
    if ((phi_j>phi_0+rho*alphaj*gphi_0) || phi_j>=phi_aj) {
      bj=alphaj; /* aj unchanged */
    } else {
      /* evaluate grad(phi(alphaj)) by central differencing */
      my_dcopy(m,xk,1,xp,1); /* xp<=xk */
      my_daxpy(m,pk,alphaj+step,xp); /* xp<=xp+(alphaj+step)*pk */
      func(xp,x,m,n,adata);
      /* calculate x<=x-xo */
      my_daxpy(n,xo,-1.0,x);
      p01=my_dnrm2(n,x);
      my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphaj-step)*pk */
      func(xp,x,m,n,adata);
      /* calculate x<=x-xo */
      my_daxpy(n,xo,-1.0,x);
      p02=my_dnrm2(n,x);
      gphi_j=(p01*p01-p02*p02)/(2.0*step);

      /* termination due to roundoff/other errors pp. 38, Fletcher */
      if ((aj-alphaj)*gphi_j<=step) {
        alphak=alphaj;
        found_step=1;
        break;
      }

      /* curvature condition satisfied: accept alphaj */
      if (fabs(gphi_j)<=-sigma*gphi_0) {
        alphak=alphaj;
        found_step=1;
        break;
      }

      if (gphi_j*(bj-aj)>=0) {
        bj=aj;
      } /* else bj unchanged */
      aj=alphaj;
    }
    ci++;
  }

  if (!found_step) {
    /* use bound to find possible step: fall back to last interpolant */
    alphak=alphaj;
  }

#ifdef DEBUG
  printf("Found %g Interval [%lf,%lf]\n",alphak,a,b);
#endif
  return alphak;
}
|
||||
|
||||
|
||||
|
||||
/* line search (Fletcher bracketing phase): find step alphak along pk that
   gives sufficient decrease of phi(alpha)=||func(xk+alpha pk)-xo||^2,
   handing over to linesearch_zoom() once a bracket is found */
/* func: vector function
   xk: parameter values size m x 1 (at which step is calculated)
   pk: step direction size m x 1 (x(k+1)=x(k)+alphak * pk)
   alpha1: initial value for step
   sigma,rho,t1,t2,t3: line search parameters (from Fletcher)
   xo: observed data size n x 1
   n: size of vector function
   step: step size for differencing
   adata: additional data passed to the function
*/
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
static double
linesearch(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *xk, double *pk, double alpha1, double sigma, double rho, double t1, double t2, double t3, double *xo, int m, int n, double step, void *adata) {

  /* phi(alpha)=f(xk+alpha pk)
    for vector function func
    f(xk) =||func(xk)||^2 */

  double *x,*xp;      /* scratch: residual (n) and trial params (m) */
  double alphai,alphai1;          /* current and previous trial steps */
  double phi_0,phi_alphai,phi_alphai1;
  double p01,p02;
  double gphi_0,gphi_i;
  double alphak;

  double mu;          /* upper bound for the step (Fletcher) */
  double tol; /* lower limit for minimization */

  int ci;

  if ((x=(double*)calloc((size_t)n,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((xp=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  alphak=1.0;
  /* evaluate phi_0 and grad(phi_0) */
  func(xk,x,m,n,adata);
  my_daxpy(n,xo,-1.0,x);
  phi_0=my_dnrm2(n,x);
  phi_0*=phi_0;
  /* select tolarance 1/100 of current function value */
  tol=MIN(0.01*phi_0,1e-6);

  /* grad(phi_0): central difference, evaluate at -step and +step */
  my_dcopy(m,xk,1,xp,1); /* xp<=xk */
  my_daxpy(m,pk,step,xp); /* xp<=xp+(0.0+step)*pk */
  func(xp,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  p01=my_dnrm2(n,x);
  my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(0.0-step)*pk */
  func(xp,x,m,n,adata);
  /* calculate x<=x-xo */
  my_daxpy(n,xo,-1.0,x);
  p02=my_dnrm2(n,x);
  gphi_0=(p01*p01-p02*p02)/(2.0*step);

  /* estimate for mu */
  /* mu = (tol-phi_0)/(rho gphi_0) */
  mu=(tol-phi_0)/(rho*gphi_0);
#ifdef DEBUG
  printf("mu=%lf, alpha1=%lf\n",mu,alpha1);
#endif

  ci=1;
  alphai=alpha1; /* initial value for alpha(i) : check if 0<alphai<=mu */
  alphai1=0.0;
  phi_alphai1=phi_0;
  /* at most 10 bracketing iterations */
  while(ci<10) {
    /* evalualte phi(alpha(i))=f(xk+alphai pk) */
    my_dcopy(m,xk,1,xp,1); /* xp<=xk */
    my_daxpy(m,pk,alphai,xp); /* xp<=xp+alphai*pk */
    func(xp,x,m,n,adata);
    /* calculate x<=x-xo */
    my_daxpy(n,xo,-1.0,x);
    phi_alphai=my_dnrm2(n,x);
    phi_alphai*=phi_alphai;

    /* cost already below tolerance: accept */
    if (phi_alphai<tol) {
      alphak=alphai;
#ifdef DEBUG
      printf("Linesearch : Condition 0 met\n");
#endif
      break;
    }

    /* sufficient-decrease failed or no improvement: bracket found */
    if ((phi_alphai>phi_0+alphai*gphi_0) || (ci>1 && phi_alphai>=phi_alphai1)) {
      /* ai=alphai1, bi=alphai bracket */
      alphak=linesearch_zoom(func,xk,pk,alphai1,alphai,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
#ifdef DEBUG
      printf("Linesearch : Condition 1 met\n");
#endif
      break;
    }

    /* evaluate grad(phi(alpha(i))) by central differencing */
    my_dcopy(m,xk,1,xp,1); /* NOT NEEDED here?? xp<=xk */
    my_daxpy(m,pk,alphai+step,xp); /* xp<=xp+(alphai+step)*pk */
    func(xp,x,m,n,adata);
    /* calculate x<=x-xo */
    my_daxpy(n,xo,-1.0,x);
    p01=my_dnrm2(n,x);
    my_daxpy(m,pk,-2.0*step,xp); /* xp<=xp+(alphai-step)*pk */
    func(xp,x,m,n,adata);
    /* calculate x<=x-xo */
    my_daxpy(n,xo,-1.0,x);
    p02=my_dnrm2(n,x);
    gphi_i=(p01*p01-p02*p02)/(2.0*step);

    /* curvature condition satisfied: accept alphai */
    if (fabs(gphi_i)<=-sigma*gphi_0) {
      alphak=alphai;
#ifdef DEBUG
      printf("Linesearch : Condition 2 met\n");
#endif
      break;
    }

    /* gradient turned non-negative: bracket found (reversed order) */
    if (gphi_i>=0) {
      /* ai=alphai, bi=alphai1 bracket */
      alphak=linesearch_zoom(func,xk,pk,alphai,alphai1,x,xp,phi_0,gphi_0,sigma,rho,t1,t2,t3,xo,m,n,step,adata);
#ifdef DEBUG
      printf("Linesearch : Condition 3 met\n");
#endif
      break;
    }

    /* else extend the step and keep bracketing */
    if (mu<=(2.0*alphai-alphai1)) {
      /* next step */
      alphai1=alphai;
      alphai=mu;
    } else {
      /* choose by interpolation in [2*alphai-alphai1,min(mu,alphai+t1*(alphai-alphai1)] */
      p01=2.0*alphai-alphai1;
      p02=MIN(mu,alphai+t1*(alphai-alphai1));
      alphai=cubic_interp(func,xk,pk,p01,p02,x,xp,xo,m,n,step,adata);
      //printf("cubic interp [%lf,%lf]->%lf\n",p01,p02,alphai);
    }
    phi_alphai1=phi_alphai;

    ci++;
  }



  free(x);
  free(xp);
#ifdef DEBUG
  printf("Step size=%g\n",alphak);
#endif
  return alphak;
}
|
||||
/*************** END Fletcher line search **********************************/
|
||||
|
||||
/* LBFGS solver: minimize ||x - func(p)||^2 over p, using the limited-memory
   BFGS two-loop recursion (mult_hessian) and Fletcher line search.
   func: vector function, evaluates model of size n x 1 for params size m x 1
   p: initial parameters size m x 1, overwritten with the solution
   x: observed data size n x 1 (also reused as scratch by the line search)
   m: no. of parameters, n: no. of data points
   itmax: max iterations
   M: LBFGS memory size (no. of stored (s,y) pairs)
   gpu_threads: not referenced in this CPU implementation
   adata: additional data passed to func
   returns 0 */
int
lbfgs_fit(
   void (*func)(double *p, double *hx, int m, int n, void *adata),
   double *p, double *x, int m, int n, int itmax, int M, int gpu_threads, void *adata) {

  double *gk; /* gradients at both k+1 and k iter */
  double *xk1,*xk; /* parameters at k+1 and k iter */
  double *pk; /* step direction H_k * grad(f) */

  double step=1e-6; /* step for interpolation */
  double *y, *s; /* storage for delta(grad) and delta(p) */
  double *rho; /* storage for 1/yk^T*sk */
  int ci,ck,cm; /* ci: circular slot index in [0,M-1], cm: byte offset ci*m */
  double alphak=1.0;


  if ((gk=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((xk1=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((xk=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  if ((pk=(double*)calloc((size_t)m,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }


  /* storage size mM x 1*/
  if ((s=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((y=(double*)calloc((size_t)m*M,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((rho=(double*)calloc((size_t)M,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  /* initial value for params xk=p */
  my_dcopy(m,p,1,xk,1);
  /* gradient gk=grad(f)_k */
  func_grad(func,xk,gk,x,m,n,step,adata);
  double gradnrm=my_dnrm2(m,gk);
  /* if gradient is too small, no need to solve, so stop */
  if (gradnrm<CLM_STOP_THRESH) {
    ck=itmax; /* skip the main loop entirely */
    step=0.0;
  } else {
    ck=0;
    /* step in [1e-6,1e-9]: scale differencing step by gradient norm */
    step=MAX(1e-9,MIN(1e-3/gradnrm,1e-6));
  }
#ifdef DEBUG
  printf("||grad||=%g step=%g\n",gradnrm,step);
#endif

  cm=0;
  ci=0;

  while (ck<itmax) {
    /* mult with hessian pk=-H_k*gk; until the memory is full (ck<M)
       only ck pairs are valid */
    if (ck<M) {
      mult_hessian(m,pk,gk,s,y,rho,ck,ci);
    } else {
      mult_hessian(m,pk,gk,s,y,rho,M,ci);
    }
    my_dscal(m,-1.0,pk);

    /* linesearch to find step length */
    /* parameters alpha1=10.0,sigma=0.1, rho=0.01, t1=9, t2=0.1, t3=0.5 */
    alphak=linesearch(func,xk,pk,10.0,0.1,0.01,9,0.1,0.5,x,m,n,step,adata);
    /* parameters c1=1e-4 c2=0.9, alpha1=1.0, alphamax=10.0, step (for alpha)=1e-4*/
    //alphak=linesearch_nw(func,xk,pk,1.0,10.0,1e-4,0.9,x,m,n,1e-4,adata);
    //alphak=1.0;
    /* check if step size is too small, then stop */
    if (fabs(alphak)<CLM_EPSILON) {
      break;
    }
    /* update parameters xk1=xk+alpha_k *pk */
    my_dcopy(m,xk,1,xk1,1);
    my_daxpy(m,pk,alphak,xk1);

    /* calculate sk=xk1-xk and yk=gk1-gk */
    /* sk=xk1 */
    my_dcopy(m,xk1,1,&s[cm],1);
    /* sk=sk-xk */
    my_daxpy(m,xk,-1.0,&s[cm]);
    /* yk=-gk */
    my_dcopy(m,gk,1,&y[cm],1);
    my_dscal(m,-1.0,&y[cm]);

    /* update gradient */
    func_grad(func,xk1,gk,x,m,n,step,adata);
    /* yk=yk+gk1 */
    my_daxpy(m,gk,1.0,&y[cm]);

    /* calculate 1/yk^T*sk */
    rho[ci]=1.0/my_ddot(m,&y[cm],&s[cm]);

    /* update xk=xk1 */
    my_dcopy(m,xk1,1,xk,1);

    //printf("iter %d store %d\n",ck,cm);
    ck++;
    /* increment storage appropriately (circular buffer of M slots) */
    if (cm<(M-1)*m) {
      /* offset of m */
      cm=cm+m;
      ci++;
    } else {
      cm=ci=0;
    }
  }


  /* copy back solution to p */
  my_dcopy(m,xk,1,p,1);

  /* for (ci=0; ci<m; ci++) {
   printf("grad %d=%lf\n",ci,gk[ci]);
  } */

  free(gk);
  free(xk1);
  free(xk);
  free(pk);
  free(s);
  free(y);
  free(rho);
  return 0;
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -1,161 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2015 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <pthread.h>
|
||||
#include <math.h>
|
||||
#include "Solvers.h"
|
||||
|
||||
|
||||
#include <nvml.h>
|
||||
|
||||
//#define MPI_BUILD
|
||||
#ifdef MPI_BUILD
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
//#define DEBUG
|
||||
|
||||
/* return random value in 0,1,..,maxval */
|
||||
#ifndef MPI_BUILD
|
||||
static int
|
||||
random_pick(int maxval, taskhist *th) {
|
||||
double rat=(double)random()/(double)RAND_MAX;
|
||||
double y=rat*(double)(maxval+1);
|
||||
int x=(int)floor(y);
|
||||
return x;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* initialize task history: no previous pick (-1), zero random seed,
   and a mutex guarding the 'prev' field against concurrent pickers */
void
init_task_hist(taskhist *th) {
 th->prev=-1;
 th->rseed=0;
 pthread_mutex_init(&th->prev_mutex,NULL);
}
|
||||
|
||||
/* tear down task history: reset fields and destroy the mutex
   created by init_task_hist() */
void
destroy_task_hist(taskhist *th) {
 th->prev=-1;
 th->rseed=0;
 pthread_mutex_destroy(&th->prev_mutex);
}
|
||||
|
||||
/* select a GPU from 0,1..,max_gpu
   in such a way to allow load balancing:
   - MPI build: deterministic round-robin by MPI rank
   - non-MPI build: query NVML and pick the card with the most free memory
     (min utilization as tie-breaker), falling back to a random pick that
     avoids repeating the previous selection */
int
select_work_gpu(int max_gpu, taskhist *th) {
#ifdef MPI_BUILD
 int rank;
 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 /* check if max_gpu > no. of actual devices */
 int actual_devcount;
 /* NOTE(review): cudaGetDeviceCount return value unchecked — if it fails,
    actual_devcount is used uninitialized; verify */
 cudaGetDeviceCount(&actual_devcount);
 if (max_gpu+1>actual_devcount) {
  return rank%(actual_devcount);
 }
 return rank%(max_gpu+1); /* modulo value */
#endif

#ifndef MPI_BUILD
 /* sequentially query the devices to find
   one with the min load/memory usage */
 nvmlReturn_t result;
 result = nvmlInit();
 int retval;
 int minid=-1;   /* device with lowest utilization */
 int maxid=-1;   /* device with most free memory */


 if (result!=NVML_SUCCESS) {
  fprintf(stderr,"%s: %d: cannot access NVML\n",__FILE__,__LINE__);
  /* return random pick */
  retval=random_pick(max_gpu, th);
  /* if this matches the previous value, select again */
  pthread_mutex_lock(&th->prev_mutex);
  while (retval==th->prev) {
   retval=random_pick(max_gpu, th);
  }

  th->prev=retval;
  pthread_mutex_unlock(&th->prev_mutex);
  return retval;
 } else {
  /* iterate over devices 0..max_gpu collecting utilization and memory */
  nvmlDevice_t device;
  nvmlUtilization_t nvmlUtilization;
  nvmlMemory_t nvmlMemory;
  unsigned int min_util=101; /* GPU utilization */
  unsigned int max_util=0; /* GPU utilization */
  unsigned long long int max_free=0; /* max free memory */
  unsigned long long int min_free=ULLONG_MAX; /* max free memory */
  int ci;
  for (ci=0; ci<=max_gpu; ci++) {
   /* NOTE(review): per-device NVML results are not checked; a failing
      query leaves stale/uninitialized values in the structs — verify */
   result=nvmlDeviceGetHandleByIndex(ci, &device);
   result=nvmlDeviceGetUtilizationRates(device, &nvmlUtilization);
   result=nvmlDeviceGetMemoryInfo(device, &nvmlMemory);
   if (min_util>nvmlUtilization.gpu) {
    min_util=nvmlUtilization.gpu;
    minid=ci;
   }
   if (max_util<nvmlUtilization.gpu) {
    max_util=nvmlUtilization.gpu;
   }
   if (max_free<nvmlMemory.free) {
    max_free=nvmlMemory.free;
    maxid=ci;
   }
   if (min_free>nvmlMemory.free) {
    min_free=nvmlMemory.free;
   }
  }
  result = nvmlShutdown();
  /* give priority for selection a GPU with max free memory,
    if there is a tie, use min utilization as second criterion */
  /* if all have 0 usage, again use random */
  if (max_free==min_free && max_util==min_util) {
   retval=random_pick(max_gpu,th);
   /* if this value matches previous one, select again */
   pthread_mutex_lock(&th->prev_mutex);
   while(retval==th->prev) {
    retval=random_pick(max_gpu,th);
   }
   th->prev=retval;
   pthread_mutex_unlock(&th->prev_mutex);
   return retval;
  } else {
   if (max_free==min_free) { /* all cards have equal free mem */
    retval=(int)minid;
   } else {
    retval=(int)maxid;
   }
  }
 }

 /* update last pick */
 pthread_mutex_lock(&th->prev_mutex);
 th->prev=retval;
 pthread_mutex_unlock(&th->prev_mutex);

 return retval;
#endif
}
|
Binary file not shown.
|
@ -1,627 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2014 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "Solvers.h"
|
||||
#include <math.h>
|
||||
|
||||
//#define DEBUG
|
||||
/* per-thread work description for manifold averaging */
typedef struct thread_data_manavg_ {
 double *Y;   /* shared solutions: Nf blocks of 8N*M doubles (one 2Nx2
                 complex block per direction per frequency) */
 int startM;  /* first direction (cluster) handled by this thread */
 int endM;    /* last direction handled by this thread (inclusive) */
 int Niter;   /* no. of average+project iterations */
 int N;       /* each solution block spans 8N doubles (2Nx2 complex) */
 int M;       /* total no. of directions */
 int Nf;      /* no. of frequencies */
} thread_data_manavg_t;
|
||||
|
||||
/* worker thread function for manifold average+projection:
   for each direction in [startM,endM], reorder the per-frequency solutions
   into 2Nx2 complex J blocks, iteratively average them over frequency and
   project each block back onto the average via the unitary Procrustes
   solution (SVD of J^H J3), then write the result back into Y */
static void*
manifold_average_threadfn(void *data) {
 thread_data_manavg_t *t=(thread_data_manavg_t*)data;
 int ci,cj,iter;
 double *Yl;              /* reordered local copy, viewed as complex via Yc */
 complex double *J3,*Jp;  /* J3: running average, Jp: projected block */
 /* local storage 2Nx2 x Nf complex values */
 if ((Yl=(double*)malloc((size_t)t->N*8*t->Nf*sizeof(double)))==0) {
  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((J3=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
  exit(1);
 }
 if ((Jp=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
  exit(1);
 }
#ifdef DEBUG
 complex double *Jerr;
 if ((Jerr=(complex double*)malloc((size_t)t->N*4*sizeof(complex double)))==0) {
  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
  exit(1);
 }
#endif

 complex double *Yc=(complex double*)Yl;
 /* 1/Nf as a complex scalar, used to form the mean */
 complex double a=1.0/(double)t->Nf+0.0*_Complex_I;

 /* work for SVD */
 complex double *WORK=0;
 complex double w[1];
 double RWORK[32]; /* size > 5*max_matrix_dimension */
 complex double JTJ[4],U[4],VT[4];
 double S[2];

 /* workspace query (lwork=-1) for the 2x2 complex SVD */
 int status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
 if (status!=0) {
  fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
  exit(1);
 }
 int lwork=(int)w[0];
 if ((WORK=(complex double*)malloc((size_t)(int)lwork*sizeof(complex double)))==0) {
  fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
  exit(1);
 }

 for (ci=t->startM; ci<=t->endM; ci++) {
  /* copy to local storage, reordering the 8 interleaved reals of each
     station into the column-major 2Nx2 complex J layout */
  for (cj=0; cj<t->Nf; cj++) {
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8, &Yl[cj*8*t->N], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8, &Yl[cj*8*t->N+1], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8, &Yl[cj*8*t->N+2], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8, &Yl[cj*8*t->N+3], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8, &Yl[cj*8*t->N+4*t->N], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8, &Yl[cj*8*t->N+4*t->N+1], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8, &Yl[cj*8*t->N+4*t->N+2], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8, &Yl[cj*8*t->N+4*t->N+3], 4);

  }
  /* first averaging, select random block in [0,Nf-1] to project to */
  /* NOTE(review): rand() in a worker thread is not thread-safe on all
     platforms (shared state); rand_r would be safer — verify intent */
  int cr=rand()%(t->Nf); /* remainder always in [0,Nf-1] */
  /* J3 <= cr th block */
  my_ccopy(t->N*4,&Yc[cr*t->N*4],1,J3,1);
  /* project the remainder */
  for (cj=0; cj<cr; cj++) {
   project_procrustes_block(t->N,J3,&Yc[cj*t->N*4]);
  }
  for (cj=cr+1; cj<t->Nf; cj++) {
   project_procrustes_block(t->N,J3,&Yc[cj*t->N*4]);
  }


  /* now each 2, 2N complex vales is one J block */
  /* average values and project to common average */
  for (iter=0; iter<t->Niter; iter++) {
   /* J3 <= 1st block */
   my_ccopy(t->N*4,Yc,1,J3,1);
   /* add the remainder */
   for (cj=1; cj<t->Nf; cj++) {
    my_caxpy(t->N*4,&Yc[cj*t->N*4],1.0+_Complex_I*0.0,J3);
   }
   my_cscal(t->N*4,a,J3);
   /* now find unitary matrix using Procrustes problem */
   for (cj=0; cj<t->Nf; cj++) {
    /* find product JTJ = J^H J3 */
    my_zgemm('C','N',2,2,2*t->N,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,J3,2*t->N,0.0+_Complex_I*0.0,JTJ,2);
    status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
    //printf("%d %d %lf %lf\n",ci,cj,S[0],S[1]);
    /* find JTJ= U V^H */
    my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
    /* find J*(JTJ) : projected matrix */
    my_zgemm('N','N',2*t->N,2,2,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,JTJ,2,0.0+_Complex_I*0.0,Jp,2*t->N);
    /* copy back */
    my_ccopy(t->N*4,Jp,1,&Yc[cj*t->N*4],1);
#ifdef DEBUG
    /* calculate error between projected value and global mean */
    my_ccopy(t->N*4,J3,1,Jerr,1);
    my_caxpy(t->N*4,&Yc[cj*t->N*4],-1.0+_Complex_I*0.0,Jerr);
    printf("Error freq=%d dir=%d iter=%d %lf\n",cj,ci,iter,my_cnrm2(t->N*4,Jerr));
#endif
   }
  }

  /* now get a fresh copy, because we should modify Y only by
    one unitary matrix */
  my_ccopy(t->N*4,Yc,1,J3,1);
  /* add the remainder */
  for (cj=1; cj<t->Nf; cj++) {
   my_caxpy(t->N*4,&Yc[cj*t->N*4],1.0+_Complex_I*0.0,J3);
  }
  my_cscal(t->N*4,a,J3);
  /* reload the original (unmodified) blocks from Y */
  for (cj=0; cj<t->Nf; cj++) {
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8, &Yl[cj*8*t->N], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8, &Yl[cj*8*t->N+1], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8, &Yl[cj*8*t->N+2], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8, &Yl[cj*8*t->N+3], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8, &Yl[cj*8*t->N+4*t->N], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8, &Yl[cj*8*t->N+4*t->N+1], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8, &Yl[cj*8*t->N+4*t->N+2], 4);
   my_dcopy(t->N, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8, &Yl[cj*8*t->N+4*t->N+3], 4);
  }

  /* project each original block once onto the final average */
  for (cj=0; cj<t->Nf; cj++) {
   /* find product JTJ = J^H J3 */
   my_zgemm('C','N',2,2,2*t->N,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,J3,2*t->N,0.0+_Complex_I*0.0,JTJ,2);

   status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
   /* find JTJ= U V^H */
   my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
   /* find J*(JTJ) : projected matrix */
   my_zgemm('N','N',2*t->N,2,2,1.0+_Complex_I*0.0,&Yc[cj*t->N*4],2*t->N,JTJ,2,0.0+_Complex_I*0.0,Jp,2*t->N);
   /* copy back */
   my_ccopy(t->N*4,Jp,1,&Yc[cj*t->N*4],1);
  }

  /* copy back from local storage (inverse of the reordering above) */
  for (cj=0; cj<t->Nf; cj++) {
   my_dcopy(t->N, &Yl[cj*8*t->N], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N], 8);
   my_dcopy(t->N, &Yl[cj*8*t->N+1], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+1], 8);
   my_dcopy(t->N, &Yl[cj*8*t->N+2], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+4], 8);
   my_dcopy(t->N, &Yl[cj*8*t->N+3], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+5], 8);
   my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+2], 8);
   my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+1], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+3], 8);
   my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+2], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+6], 8);
   my_dcopy(t->N, &Yl[cj*8*t->N+4*t->N+3], 4, &t->Y[cj*8*t->N*t->M+ci*8*t->N+7], 8);

  }
 }

#ifdef DEBUG
 free(Jerr);
#endif
 free(Yl);
 free(J3);
 free(Jp);
 free(WORK);
 return NULL;
}
|
||||
|
||||
int
|
||||
calculate_manifold_average(int N,int M,int Nf,double *Y,int Niter,int Nt) {
|
||||
/* Y : each 2Nx2xM blocks belong to one freq,
|
||||
select one 2Nx2 from this, reorder to J format : Nf blocks
|
||||
and average */
|
||||
pthread_attr_t attr;
|
||||
pthread_t *th_array;
|
||||
thread_data_manavg_t *threaddata;
|
||||
|
||||
int ci,Nthb0,Nthb,nth,nth1;
|
||||
/* clusters per thread */
|
||||
Nthb0=(M+Nt-1)/Nt;
|
||||
|
||||
/* setup threads */
|
||||
pthread_attr_init(&attr);
|
||||
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
|
||||
|
||||
if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((threaddata=(thread_data_manavg_t*)malloc((size_t)Nt*sizeof(thread_data_manavg_t)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ci=0;
|
||||
for (nth=0; nth<Nt && ci<M; nth++) {
|
||||
if (ci+Nthb0<M) {
|
||||
Nthb=Nthb0;
|
||||
} else {
|
||||
Nthb=M-ci;
|
||||
}
|
||||
threaddata[nth].Y=Y;
|
||||
threaddata[nth].N=N;
|
||||
threaddata[nth].M=M;
|
||||
threaddata[nth].Nf=Nf;
|
||||
threaddata[nth].Niter=Niter;
|
||||
threaddata[nth].startM=ci;
|
||||
threaddata[nth].endM=ci+Nthb-1;
|
||||
|
||||
pthread_create(&th_array[nth],&attr,manifold_average_threadfn,(void*)(&threaddata[nth]));
|
||||
ci=ci+Nthb;
|
||||
}
|
||||
|
||||
for(nth1=0; nth1<nth; nth1++) {
|
||||
pthread_join(th_array[nth1],NULL);
|
||||
}
|
||||
|
||||
pthread_attr_destroy(&attr);
|
||||
|
||||
|
||||
free(th_array);
|
||||
free(threaddata);
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int
|
||||
project_procrustes(int N,double *J,double *J1) {
|
||||
/* min ||J - J1 U || find U */
|
||||
complex double *X,*Y;
|
||||
/* local storage */
|
||||
if ((X=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((Y=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
double *Jx=(double*)X;
|
||||
double *Jy=(double*)Y;
|
||||
/* copy to get correct format */
|
||||
my_dcopy(N, &J[0], 8, &Jx[0], 4);
|
||||
my_dcopy(N, &J[0+1], 8, &Jx[1], 4);
|
||||
my_dcopy(N, &J[0+4], 8, &Jx[2], 4);
|
||||
my_dcopy(N, &J[0+5], 8, &Jx[3], 4);
|
||||
my_dcopy(N, &J[0+2], 8, &Jx[4*N], 4);
|
||||
my_dcopy(N, &J[0+3], 8, &Jx[4*N+1], 4);
|
||||
my_dcopy(N, &J[0+6], 8, &Jx[4*N+2], 4);
|
||||
my_dcopy(N, &J[0+7], 8, &Jx[4*N+3], 4);
|
||||
my_dcopy(N, &J1[0], 8, &Jy[0], 4);
|
||||
my_dcopy(N, &J1[0+1], 8, &Jy[1], 4);
|
||||
my_dcopy(N, &J1[0+4], 8, &Jy[2], 4);
|
||||
my_dcopy(N, &J1[0+5], 8, &Jy[3], 4);
|
||||
my_dcopy(N, &J1[0+2], 8, &Jy[4*N], 4);
|
||||
my_dcopy(N, &J1[0+3], 8, &Jy[4*N+1], 4);
|
||||
my_dcopy(N, &J1[0+6], 8, &Jy[4*N+2], 4);
|
||||
my_dcopy(N, &J1[0+7], 8, &Jy[4*N+3], 4);
|
||||
|
||||
/* min ||X - Y U|| find U */
|
||||
|
||||
/* work for SVD */
|
||||
complex double *WORK=0;
|
||||
complex double w[1];
|
||||
double RWORK[32]; /* size > 5*max_matrix_dimension */
|
||||
complex double JTJ[4],U[4],VT[4];
|
||||
double S[2];
|
||||
|
||||
int status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
|
||||
if (status!=0) {
|
||||
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
|
||||
exit(1);
|
||||
}
|
||||
int lwork=(int)w[0];
|
||||
if ((WORK=(complex double*)malloc((size_t)(int)lwork*sizeof(complex double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* find product JTJ = Y^H X */
|
||||
my_zgemm('C','N',2,2,2*N,1.0+_Complex_I*0.0,Y,2*N,X,2*N,0.0+_Complex_I*0.0,JTJ,2);
|
||||
/* JTJ = U S V^H */
|
||||
status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
|
||||
/* find JTJ= U V^H */
|
||||
my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
|
||||
/* find Y*(JTJ) : projected matrix -> store in X */
|
||||
my_zgemm('N','N',2*N,2,2,1.0+_Complex_I*0.0,Y,2*N,JTJ,2,0.0+_Complex_I*0.0,X,2*N);
|
||||
|
||||
my_dcopy(N, &Jx[0], 4, &J1[0], 8);
|
||||
my_dcopy(N, &Jx[1], 4, &J1[0+1], 8);
|
||||
my_dcopy(N, &Jx[2], 4, &J1[0+4], 8);
|
||||
my_dcopy(N, &Jx[3], 4, &J1[0+5], 8);
|
||||
my_dcopy(N, &Jx[4*N], 4, &J1[0+2], 8);
|
||||
my_dcopy(N, &Jx[4*N+1], 4, &J1[0+3], 8);
|
||||
my_dcopy(N, &Jx[4*N+2], 4, &J1[0+6], 8);
|
||||
my_dcopy(N, &Jx[4*N+3], 4, &J1[0+7], 8);
|
||||
|
||||
|
||||
free(WORK);
|
||||
free(X);
|
||||
free(Y);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int
|
||||
project_procrustes_block(int N,complex double *X,complex double *Y) {
|
||||
/* min ||X - Y U || find U */
|
||||
complex double *Jlocal;
|
||||
/* local storage */
|
||||
if ((Jlocal=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* work for SVD */
|
||||
complex double *WORK=0;
|
||||
complex double w[1];
|
||||
double RWORK[32]; /* size > 5*max_matrix_dimension */
|
||||
complex double JTJ[4],U[4],VT[4];
|
||||
double S[2];
|
||||
|
||||
int status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,w,-1,RWORK);
|
||||
if (status!=0) {
|
||||
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,status);
|
||||
exit(1);
|
||||
}
|
||||
int lwork=(int)w[0];
|
||||
if ((WORK=(complex double*)malloc((size_t)(int)lwork*sizeof(complex double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* find product JTJ = Y^H X */
|
||||
my_zgemm('C','N',2,2,2*N,1.0+_Complex_I*0.0,Y,2*N,X,2*N,0.0+_Complex_I*0.0,JTJ,2);
|
||||
/* JTJ = U S V^H */
|
||||
status=my_zgesvd('A','A',2,2,JTJ,2,S,U,2,VT,2,WORK,lwork,RWORK);
|
||||
/* find JTJ= U V^H */
|
||||
my_zgemm('N','N',2,2,2,1.0+_Complex_I*0.0,U,2,VT,2,0.0+_Complex_I*0.0,JTJ,2);
|
||||
/* find Y*(JTJ) : projected matrix -> store in Jlocal */
|
||||
my_zgemm('N','N',2*N,2,2,1.0+_Complex_I*0.0,Y,2*N,JTJ,2,0.0+_Complex_I*0.0,Jlocal,2*N);
|
||||
|
||||
/* copy Jlocal -> Y */
|
||||
my_dcopy(8*N, (double*)Jlocal, 1, (double*)Y, 1);
|
||||
|
||||
free(WORK);
|
||||
free(Jlocal);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//#define DEBUG
|
||||
/* Extract only the phase of diagonal entries from solutions
|
||||
p: 8Nx1 solutions, orders as [(real,imag)vec(J1),(real,imag)vec(J2),...]
|
||||
pout: 8Nx1 phases (exp(j*phase)) of solutions, after joint diagonalization of p
|
||||
N: no. of 2x2 Jones matrices in p, having common unitary ambiguity
|
||||
niter: no of iterations for Jacobi rotation */
|
||||
int
|
||||
extract_phases(double *p, double *pout, int N, int niter) {
|
||||
|
||||
/* local storage */
|
||||
complex double *J,*Jcopy;
|
||||
/* local storage, change ordering of solutions [J_1^T,J_2^T,...]^T */
|
||||
if ((J=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((Jcopy=(complex double*)malloc((size_t)N*4*sizeof(complex double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
double *Jx=(double *)J;
|
||||
/* copy to get correct format */
|
||||
my_dcopy(N, &p[0], 8, &Jx[0], 4);
|
||||
my_dcopy(N, &p[0+1], 8, &Jx[1], 4);
|
||||
my_dcopy(N, &p[0+4], 8, &Jx[2], 4);
|
||||
my_dcopy(N, &p[0+5], 8, &Jx[3], 4);
|
||||
my_dcopy(N, &p[0+2], 8, &Jx[4*N], 4);
|
||||
my_dcopy(N, &p[0+3], 8, &Jx[4*N+1], 4);
|
||||
my_dcopy(N, &p[0+6], 8, &Jx[4*N+2], 4);
|
||||
my_dcopy(N, &p[0+7], 8, &Jx[4*N+3], 4);
|
||||
|
||||
complex double h[3],Hc[9];
|
||||
double H[9];
|
||||
double W[3],Z[3];
|
||||
double w[1],*WORK;
|
||||
int IWORK[15],IFAIL[3],info;
|
||||
int ni,ci;
|
||||
complex double c,s,G[4];
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("J=[\n");
|
||||
for (ci=0; ci<N; ci++) {
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",p[8*ci],p[8*ci+1],p[8*ci+2],p[8*ci+3]);
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",p[8*ci+4],p[8*ci+5],p[8*ci+6],p[8*ci+7]);
|
||||
}
|
||||
printf("];\n");
|
||||
#endif
|
||||
/* setup workspace for eigenvalue decomposition */
|
||||
info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,w,-1,IWORK,IFAIL);
|
||||
if (info) {
|
||||
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
|
||||
exit(1);
|
||||
}
|
||||
/* get work size */
|
||||
int lwork=(int)w[0];
|
||||
/* allocate memory */
|
||||
if ((WORK=(double*)malloc((size_t)lwork*sizeof(double)))==0) {
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
/* iteration loop */
|
||||
for (ni=0; ni<niter; ni++) {
|
||||
|
||||
/************** for element (1,2) **********************/
|
||||
/* accumulate h*h^H product */
|
||||
memset(Hc,0,9*sizeof(complex double));
|
||||
for (ci=0; ci<N; ci++) {
|
||||
/* [a_ii-a_jj,a_ij+a_ji,I*(a_ji-a_ij)] */
|
||||
h[0]=conj(J[2*ci]-J[2*ci+2*N+1]);
|
||||
h[1]=conj(J[2*ci+2*N]+J[2*ci+1]);
|
||||
h[2]=conj(_Complex_I*(J[2*ci+1]-J[2*ci+2*N]));
|
||||
/* store results onto lower triangle */
|
||||
my_zher('L',3,1.0,h,1,Hc,3);
|
||||
}
|
||||
/* get real part, copy it to lower triangle */
|
||||
H[0]=creal(Hc[0]);
|
||||
H[1]=creal(Hc[1]);
|
||||
H[2]=creal(Hc[2]);
|
||||
H[4]=creal(Hc[4]);
|
||||
H[5]=creal(Hc[5]);
|
||||
H[8]=creal(Hc[8]);
|
||||
#ifdef DEBUG
|
||||
printf("H=[\n");
|
||||
printf("%e %e %e\n",H[0],H[1],H[2]);
|
||||
printf("%e %e %e\n",H[1],H[4],H[5]);
|
||||
printf("%e %e %e\n",H[2],H[5],H[8]);
|
||||
printf("];\n");
|
||||
#endif
|
||||
info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,WORK,lwork,IWORK,IFAIL);
|
||||
if (info<0) {
|
||||
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
|
||||
exit(1);
|
||||
}
|
||||
#ifdef DEBUG
|
||||
printf("max eigenvalue=%e\n",W[0]);
|
||||
printf("ev=[\n");
|
||||
printf("%e\n",Z[0]);
|
||||
printf("%e\n",Z[1]);
|
||||
printf("%e\n",Z[2]);
|
||||
printf("];\n");
|
||||
#endif
|
||||
|
||||
/* form sin,cos values */
|
||||
if (Z[0]>=0.0) {
|
||||
c=sqrt(0.5+Z[0]*0.5)+_Complex_I*0.0;
|
||||
s=0.5*(Z[1]-_Complex_I*Z[2])/c;
|
||||
} else {
|
||||
/* flip sign of eigenvector */
|
||||
c=sqrt(0.5-Z[0]*0.5)+_Complex_I*0.0;
|
||||
s=0.5*(-Z[1]+_Complex_I*Z[2])/c;
|
||||
}
|
||||
/* form Givens rotation matrix */
|
||||
G[0]=c;
|
||||
G[1]=-s;
|
||||
G[2]=conj(s);
|
||||
G[3]=conj(c);
|
||||
#ifdef DEBUG
|
||||
printf("G=[\n");
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[0]),cimag(G[0]),creal(G[2]),cimag(G[2]));
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[1]),cimag(G[1]),creal(G[3]),cimag(G[3]));
|
||||
printf("];\n");
|
||||
#endif
|
||||
/* rotate J <= J * G^H: Jcopy = 1 x J x G^H + 0 x Jcopy */
|
||||
my_zgemm('N','C',2*N,2,2,1.0+_Complex_I*0.0,J,2*N,G,2,0.0+_Complex_I*0.0,Jcopy,2*N);
|
||||
memcpy(J,Jcopy,(size_t)4*N*sizeof(complex double));
|
||||
#ifdef DEBUG
|
||||
printf("JGH=[\n");
|
||||
for (ci=0; ci<N; ci++) {
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
|
||||
}
|
||||
printf("];\n");
|
||||
#endif
|
||||
|
||||
/************** for element (2,1) **********************/
|
||||
/* accumulate h*h^H product */
|
||||
memset(Hc,0,9*sizeof(complex double));
|
||||
for (ci=0; ci<N; ci++) {
|
||||
/* [a_ii-a_jj,a_ij+a_ji,I*(a_ji-a_ij)] */
|
||||
h[0]=conj(J[2*ci+2*N+1]-J[2*ci]);
|
||||
h[1]=conj(J[2*ci+1]+J[2*ci+2*N]);
|
||||
h[2]=conj(_Complex_I*(J[2*ci+2*N]-J[2*ci+1]));
|
||||
/* store results onto lower triangle */
|
||||
my_zher('L',3,1.0,h,1,Hc,3);
|
||||
}
|
||||
/* get real part, copy it to lower triangle */
|
||||
H[0]=creal(Hc[0]);
|
||||
H[1]=creal(Hc[1]);
|
||||
H[2]=creal(Hc[2]);
|
||||
H[4]=creal(Hc[4]);
|
||||
H[5]=creal(Hc[5]);
|
||||
H[8]=creal(Hc[8]);
|
||||
#ifdef DEBUG
|
||||
printf("H=[\n");
|
||||
printf("%e %e %e\n",H[0],H[1],H[2]);
|
||||
printf("%e %e %e\n",H[1],H[4],H[5]);
|
||||
printf("%e %e %e\n",H[2],H[5],H[8]);
|
||||
printf("];\n");
|
||||
#endif
|
||||
info=my_dsyevx('V','I','L',3,H,3,0.0,0.0,3,3,dlamch('S'),1,W,Z,3,WORK,lwork,IWORK,IFAIL);
|
||||
if (info<0) {
|
||||
fprintf(stderr,"%s: %d: LAPACK error %d\n",__FILE__,__LINE__,info);
|
||||
exit(1);
|
||||
}
|
||||
#ifdef DEBUG
|
||||
printf("max eigenvalue=%e\n",W[0]);
|
||||
printf("ev=[\n");
|
||||
printf("%e\n",Z[0]);
|
||||
printf("%e\n",Z[1]);
|
||||
printf("%e\n",Z[2]);
|
||||
printf("];\n");
|
||||
#endif
|
||||
|
||||
/* form sin,cos values */
|
||||
if (Z[0]>=0.0) {
|
||||
c=sqrt(0.5+Z[0]*0.5)+_Complex_I*0.0;
|
||||
s=0.5*(Z[1]-_Complex_I*Z[2])/c;
|
||||
} else {
|
||||
/* flip sign of eigenvector */
|
||||
c=sqrt(0.5-Z[0]*0.5)+_Complex_I*0.0;
|
||||
s=0.5*(-Z[1]+_Complex_I*Z[2])/c;
|
||||
}
|
||||
/* form Givens rotation matrix */
|
||||
G[0]=c;
|
||||
G[1]=-s;
|
||||
G[2]=conj(s);
|
||||
G[3]=conj(c);
|
||||
#ifdef DEBUG
|
||||
printf("G=[\n");
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[0]),cimag(G[0]),creal(G[2]),cimag(G[2]));
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(G[1]),cimag(G[1]),creal(G[3]),cimag(G[3]));
|
||||
printf("];\n");
|
||||
#endif
|
||||
/* rotate J <= J * G^H: Jcopy = 1 x J x G^H + 0 x Jcopy */
|
||||
my_zgemm('N','C',2*N,2,2,1.0+_Complex_I*0.0,J,2*N,G,2,0.0+_Complex_I*0.0,Jcopy,2*N);
|
||||
/* before copying updated result, find residual norm */
|
||||
/* J = -Jcopy + J */
|
||||
my_caxpy(4*N,Jcopy,-1.0+_Complex_I*0.0,J);
|
||||
#ifdef DEBUG
|
||||
printf("Iter %d residual=%lf\n",ni,my_cnrm2(4*N,J));
|
||||
#endif
|
||||
memcpy(J,Jcopy,(size_t)4*N*sizeof(complex double));
|
||||
#ifdef DEBUG
|
||||
printf("JGH=[\n");
|
||||
for (ci=0; ci<N; ci++) {
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
|
||||
}
|
||||
printf("];\n");
|
||||
#endif
|
||||
|
||||
|
||||
}
|
||||
free(WORK);
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Jfinal=[\n");
|
||||
for (ci=0; ci<N; ci++) {
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci]),cimag(J[2*ci]),creal(J[2*N+2*ci]),cimag(J[2*N+2*ci]));
|
||||
printf("%lf+j*(%lf), %lf+j*(%lf)\n",creal(J[2*ci+1]),cimag(J[2*ci+1]),creal(J[2*N+2*ci+1]),cimag(J[2*N+2*ci+1]));
|
||||
}
|
||||
printf("];\n");
|
||||
#endif
|
||||
|
||||
|
||||
/* extract phase only from diagonal elements */
|
||||
for (ci=0; ci<N; ci++) {
|
||||
J[2*ci]=J[2*ci]/cabs(J[2*ci]);
|
||||
J[2*ci+2*N+1]=J[2*ci+2*N+1]/cabs(J[2*ci+2*N+1]);
|
||||
}
|
||||
|
||||
/* copy back to output (only the diagonal values) */
|
||||
memset(pout,0,sizeof(double)*8*N);
|
||||
my_dcopy(N, &Jx[0], 4, &pout[0], 8);
|
||||
my_dcopy(N, &Jx[1], 4, &pout[0+1], 8);
|
||||
my_dcopy(N, &Jx[4*N+2], 4, &pout[0+6], 8);
|
||||
my_dcopy(N, &Jx[4*N+3], 4, &pout[0+7], 8);
|
||||
|
||||
free(J);
|
||||
free(Jcopy);
|
||||
return 0;
|
||||
}
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -1,380 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "cuda.h"
|
||||
#include <cuComplex.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* enable this for checking for kernel failure */
|
||||
//#define CUDA_DBG
|
||||
|
||||
__global__ void kernel_diagdiv_fl(int M, float eps, float *y, float *x){
|
||||
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
/* make sure to use only M threads */
|
||||
if (tid<M) {
|
||||
if (x[tid]>eps) {
|
||||
y[tid]=y[tid]/x[tid];
|
||||
} else {
|
||||
y[tid]=0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kernel_diagmu_fl(int M, float *A,float mu){
|
||||
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
/* make sure to use only M threads */
|
||||
if (tid<M) {
|
||||
A[tid*(M+1)]=A[tid*(M+1)]+mu;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void kernel_func_fl(int Nbase, float *x, float *coh, float *p, short *bb, int N){
|
||||
/* global thread index : equal to the baseline */
|
||||
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
|
||||
/* this thread works on
|
||||
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
|
||||
bb[2*n:2*n+1] (sta1,sta2)
|
||||
organization of p (N stations and M clusters)
|
||||
sta 0 sta 1 sta 2 .... sta N-1
|
||||
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
|
||||
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
|
||||
......
|
||||
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
|
||||
|
||||
organization of coherencies (coh)
|
||||
[0, 8*M-1] : baseline 0
|
||||
[8*M, 8*M+8*M-1]: baseline 1
|
||||
[n*8*M, n*8*M+8*M-1]: baseline n
|
||||
......
|
||||
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
|
||||
|
||||
residual error stored at sum[n]
|
||||
*/
|
||||
|
||||
if(n<Nbase) {
|
||||
int sta1=(int)bb[2*n];
|
||||
int sta2=(int)bb[2*n+1];
|
||||
|
||||
/* condition for calculating this baseline sum is
|
||||
1) its not flagged (sta1,sta2)>=0
|
||||
*/
|
||||
if (sta1>=0 && sta2>=0) {
|
||||
cuFloatComplex G1[4];
|
||||
float pp[8];
|
||||
pp[0]=p[sta1*8];
|
||||
pp[1]=p[sta1*8+1];
|
||||
pp[2]=p[sta1*8+2];
|
||||
pp[3]=p[sta1*8+3];
|
||||
pp[4]=p[sta1*8+4];
|
||||
pp[5]=p[sta1*8+5];
|
||||
pp[6]=p[sta1*8+6];
|
||||
pp[7]=p[sta1*8+7];
|
||||
G1[0].x=pp[0];
|
||||
G1[0].y=pp[1];
|
||||
G1[1].x=pp[2];
|
||||
G1[1].y=pp[3];
|
||||
G1[2].x=pp[4];
|
||||
G1[2].y=pp[5];
|
||||
G1[3].x=pp[6];
|
||||
G1[3].y=pp[7];
|
||||
|
||||
|
||||
cuFloatComplex C[4];
|
||||
C[0].x=coh[8*n];
|
||||
C[0].y=coh[8*n+1];
|
||||
C[1].x=coh[8*n+2];
|
||||
C[1].y=coh[8*n+3];
|
||||
C[2].x=coh[8*n+4];
|
||||
C[2].y=coh[8*n+5];
|
||||
C[3].x=coh[8*n+6];
|
||||
C[3].y=coh[8*n+7];
|
||||
|
||||
cuFloatComplex T1[4];
|
||||
/* T=G1*C */
|
||||
T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
|
||||
T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
|
||||
T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
|
||||
T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
|
||||
|
||||
cuFloatComplex G2[4];
|
||||
/* conjugate this */
|
||||
pp[0]=p[sta2*8];
|
||||
pp[1]=-p[sta2*8+1];
|
||||
pp[2]=p[sta2*8+2];
|
||||
pp[3]=-p[sta2*8+3];
|
||||
pp[4]=p[sta2*8+4];
|
||||
pp[5]=-p[sta2*8+5];
|
||||
pp[6]=p[sta2*8+6];
|
||||
pp[7]=-p[sta2*8+7];
|
||||
G2[0].x=pp[0];
|
||||
G2[0].y=pp[1];
|
||||
G2[2].x=pp[2];
|
||||
G2[2].y=pp[3];
|
||||
G2[1].x=pp[4];
|
||||
G2[1].y=pp[5];
|
||||
G2[3].x=pp[6];
|
||||
G2[3].y=pp[7];
|
||||
|
||||
cuFloatComplex T2[4];
|
||||
T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
|
||||
T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
|
||||
T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
|
||||
T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
|
||||
/* update model vector */
|
||||
x[8*n]=T2[0].x;
|
||||
x[8*n+1]=T2[0].y;
|
||||
x[8*n+2]=T2[1].x;
|
||||
x[8*n+3]=T2[1].y;
|
||||
x[8*n+4]=T2[2].x;
|
||||
x[8*n+5]=T2[2].y;
|
||||
x[8*n+6]=T2[3].x;
|
||||
x[8*n+7]=T2[3].y;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__global__ void kernel_jacf_fl(int Nbase, int M, float *jac, float *coh, float *p, short *bb, int N){
|
||||
/* global thread index : equal to the baseline */
|
||||
unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
|
||||
/* which parameter:0...M */
|
||||
unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;
|
||||
|
||||
/* this thread works on
|
||||
x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
|
||||
bb[2*n:2*n+1] (sta1,sta2)
|
||||
organization of p (N stations and M clusters)
|
||||
sta 0 sta 1 sta 2 .... sta N-1
|
||||
clus 0 0...7 8...15 16...23 ... 8N-8 8N-1
|
||||
clus 1 8N..8N+7 8N+8..8N+15 8N+16..8N+23 .... 8N+8N-8...8N+8N-1
|
||||
......
|
||||
clus M-1 (M-1)N..(M-1)N+7 (M-1)N+8..(M-1)N+15.... ...(M-1)N+8N-8 (M-1)N+8N-1
|
||||
|
||||
organization of coherencies (coh)
|
||||
[0, 8*M-1] : baseline 0
|
||||
[8*M, 8*M+8*M-1]: baseline 1
|
||||
[n*8*M, n*8*M+8*M-1]: baseline n
|
||||
......
|
||||
[n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n
|
||||
|
||||
residual error stored at sum[n]
|
||||
*/
|
||||
|
||||
if(n<Nbase && m<M) {
|
||||
int sta1=(int)bb[2*n];
|
||||
int sta2=(int)bb[2*n+1];
|
||||
/* condition for calculating this baseline sum is
|
||||
If this baseline is flagged,
|
||||
or if this parameter does not belong to sta1 or sta2
|
||||
we do not compute
|
||||
*/
|
||||
//int stc=m/8; /* 0...Ns-1 (because M=total par= 8 * Nstations */
|
||||
int stc=m>>3; /* 0...Ns-1 (because M=total par= 8 * Nstations */
|
||||
|
||||
if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {
|
||||
|
||||
cuFloatComplex C[4];
|
||||
C[0].x=coh[8*n];
|
||||
C[0].y=coh[8*n+1];
|
||||
C[1].x=coh[8*n+2];
|
||||
C[1].y=coh[8*n+3];
|
||||
C[2].x=coh[8*n+4];
|
||||
C[2].y=coh[8*n+5];
|
||||
C[3].x=coh[8*n+6];
|
||||
C[3].y=coh[8*n+7];
|
||||
|
||||
/* which parameter exactly 0..7 */
|
||||
//int stoff=m%8;
|
||||
int stoff=m-stc*8;
|
||||
float pp1[8];
|
||||
float pp2[8];
|
||||
if (stc==sta1) {
|
||||
for (int cn=0; cn<8; cn++) {
|
||||
pp1[cn]=0.0f;
|
||||
pp2[cn]=p[sta2*8+cn];
|
||||
}
|
||||
pp1[stoff]=1.0f;
|
||||
} else if (stc==sta2) {
|
||||
for (int cn=0; cn<8; cn++) {
|
||||
pp2[cn]=0.0f;
|
||||
pp1[cn]=p[sta1*8+cn];
|
||||
}
|
||||
pp2[stoff]=1.0f;
|
||||
}
|
||||
|
||||
|
||||
cuFloatComplex G1[4];
|
||||
G1[0].x=pp1[0];
|
||||
G1[0].y=pp1[1];
|
||||
G1[1].x=pp1[2];
|
||||
G1[1].y=pp1[3];
|
||||
G1[2].x=pp1[4];
|
||||
G1[2].y=pp1[5];
|
||||
G1[3].x=pp1[6];
|
||||
G1[3].y=pp1[7];
|
||||
|
||||
cuFloatComplex T1[4];
|
||||
/* T=G1*C */
|
||||
T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
|
||||
T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
|
||||
T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
|
||||
T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));
|
||||
|
||||
cuFloatComplex G2[4];
|
||||
/* conjugate this */
|
||||
G2[0].x=pp2[0];
|
||||
G2[0].y=-pp2[1];
|
||||
G2[2].x=pp2[2];
|
||||
G2[2].y=-pp2[3];
|
||||
G2[1].x=pp2[4];
|
||||
G2[1].y=-pp2[5];
|
||||
G2[3].x=pp2[6];
|
||||
G2[3].y=-pp2[7];
|
||||
|
||||
cuFloatComplex T2[4];
|
||||
T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
|
||||
T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
|
||||
T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
|
||||
T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
|
||||
/* update jacobian */
|
||||
/* NOTE: row major order */
|
||||
jac[m+M*8*n]=T2[0].x;
|
||||
jac[m+M*(8*n+1)]=T2[0].y;
|
||||
jac[m+M*(8*n+2)]=T2[1].x;
|
||||
jac[m+M*(8*n+3)]=T2[1].y;
|
||||
jac[m+M*(8*n+4)]=T2[2].x;
|
||||
jac[m+M*(8*n+5)]=T2[2].y;
|
||||
jac[m+M*(8*n+6)]=T2[3].x;
|
||||
jac[m+M*(8*n+7)]=T2[3].y;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* only use extern if calling code is C */
|
||||
extern "C"
|
||||
{
|
||||
|
||||
|
||||
/* divide by singular values Dpd[]/Sd[] for Sd[]> eps */
|
||||
void
|
||||
cudakernel_diagdiv_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float eps, float *Dpd, float *Sd) {
|
||||
|
||||
#ifdef CUDA_DBG
|
||||
cudaError_t error;
|
||||
#endif
|
||||
kernel_diagdiv_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, eps, Dpd, Sd);
|
||||
cudaDeviceSynchronize();
|
||||
#ifdef CUDA_DBG
|
||||
error = cudaGetLastError();
|
||||
if(error != cudaSuccess)
|
||||
{
|
||||
// print the CUDA error message and exit
|
||||
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
|
||||
exit(-1);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
/* cuda driver for calculating
|
||||
A<= A+mu I, adding mu to diagonal entries of A
|
||||
A: size MxM
|
||||
ThreadsPerBlock, BlocksPerGrid calculated to meet M
|
||||
*/
|
||||
void
|
||||
cudakernel_diagmu_fl(int ThreadsPerBlock, int BlocksPerGrid, int M, float *A, float mu) {
|
||||
#ifdef CUDA_DBG
|
||||
cudaError_t error;
|
||||
#endif
|
||||
kernel_diagmu_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(M, A, mu);
|
||||
cudaDeviceSynchronize();
|
||||
#ifdef CUDA_DBG
|
||||
error = cudaGetLastError();
|
||||
if(error != cudaSuccess)
|
||||
{
|
||||
// print the CUDA error message and exit
|
||||
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
|
||||
exit(-1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* cuda driver for calculating f() */
|
||||
/* p: params (Mx1), x: data (Nx1), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
|
||||
void
|
||||
cudakernel_func_fl(int ThreadsPerBlock, int BlocksPerGrid, float *p, float *x, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
|
||||
|
||||
#ifdef CUDA_DBG
|
||||
cudaError_t error;
|
||||
#endif
|
||||
cudaMemset(x, 0, N*sizeof(float));
|
||||
// printf("Kernel data size=%d, block=%d, thread=%d, baselines=%d\n",N,BlocksPerGrid, ThreadsPerBlock,Nbase);
|
||||
kernel_func_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase, x, coh, p, bbh, Nstations);
|
||||
cudaDeviceSynchronize();
|
||||
#ifdef CUDA_DBG
|
||||
error = cudaGetLastError();
|
||||
if(error != cudaSuccess)
|
||||
{
|
||||
// print the CUDA error message and exit
|
||||
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
|
||||
exit(-1);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
/* cuda driver for calculating jacf() */
|
||||
/* p: params (Mx1), jac: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
|
||||
void
|
||||
cudakernel_jacf_fl(int ThreadsPerBlock_row, int ThreadsPerBlock_col, float *p, float *jac, int M, int N, float *coh, short *bbh, int Nbase, int Mclus, int Nstations) {
|
||||
|
||||
#ifdef CUDA_DBG
|
||||
cudaError_t error;
|
||||
#endif
|
||||
/* NOTE: use small value for ThreadsPerBlock here, like 8 */
|
||||
dim3 threadsPerBlock(16, 8);
|
||||
/* jacobian: Nbase x Nstations (proportional to N), so */
|
||||
dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x,
|
||||
(M+threadsPerBlock.y-1)/threadsPerBlock.y);
|
||||
/* set memory of jac to zero */
|
||||
cudaMemset(jac, 0, N*M*sizeof(float));
|
||||
// printf("Kernel Jax data size=%d, params=%d, block=%d,%d, thread=%d,%d, baselines=%d\n",N, M, numBlocks.x,numBlocks.y, threadsPerBlock.x, threadsPerBlock.y, Nbase);
|
||||
kernel_jacf_fl<<< numBlocks, threadsPerBlock>>>(Nbase, M, jac, coh, p, bbh, Nstations);
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
#ifdef CUDA_DBG
|
||||
error = cudaGetLastError();
|
||||
if(error != cudaSuccess)
|
||||
{
|
||||
// print the CUDA error message and exit
|
||||
fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
|
||||
exit(-1);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
}
|
Binary file not shown.
|
@ -1,462 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
|
||||
#include "Solvers.h"
|
||||
#include <string.h> /* for memcpy */
|
||||
|
||||
/* machine precision */
|
||||
double
|
||||
dlamch(char CMACH) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern double dlamch_(char *CMACH);
|
||||
return(dlamch_(&CMACH));
|
||||
}
|
||||
|
||||
|
||||
/* blas dcopy */
|
||||
/* y = x */
|
||||
/* read x values spaced by Nx (so x size> N*Nx) */
|
||||
/* write to y values spaced by Ny (so y size > N*Ny) */
|
||||
void
|
||||
my_dcopy(int N, double *x, int Nx, double *y, int Ny) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern void dcopy_(int *N, double *x, int *incx, double *y, int *incy);
|
||||
/* use memcpy if Nx=Ny=1 */
|
||||
if (Nx==1&&Ny==1) {
|
||||
memcpy((void*)y,(void*)x,sizeof(double)*(size_t)N);
|
||||
} else {
|
||||
dcopy_(&N,x,&Nx,y,&Ny);
|
||||
}
|
||||
}
|
||||
/* blas scale */
|
||||
/* x = a. x */
|
||||
void
|
||||
my_dscal(int N, double a, double *x) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern void dscal_(int *N, double *alpha, double *x, int *incx);
|
||||
int i=1;
|
||||
dscal_(&N,&a,x,&i);
|
||||
}
|
||||
void
|
||||
my_sscal(int N, float a, float *x) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern void sscal_(int *N, float *alpha, float *x, int *incx);
|
||||
int i=1;
|
||||
sscal_(&N,&a,x,&i);
|
||||
}
|
||||
|
||||
/* x^T*y */
|
||||
double
|
||||
my_ddot(int N, double *x, double *y) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern double ddot_(int *N, double *x, int *incx, double *y, int *incy);
|
||||
int i=1;
|
||||
return(ddot_(&N,x,&i,y,&i));
|
||||
}
|
||||
|
||||
/* ||x||_2 */
|
||||
double
|
||||
my_dnrm2(int N, double *x) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern double dnrm2_(int *N, double *x, int *incx);
|
||||
int i=1;
|
||||
return(dnrm2_(&N,x,&i));
|
||||
}
|
||||
float
|
||||
my_fnrm2(int N, float *x) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern float snrm2_(int *N, float *x, int *incx);
|
||||
int i=1;
|
||||
return(snrm2_(&N,x,&i));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* sum||x||_1 */
|
||||
double
|
||||
my_dasum(int N, double *x) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern double dasum_(int *N, double *x, int *incx);
|
||||
int i=1;
|
||||
return(dasum_(&N,x,&i));
|
||||
}
|
||||
float
|
||||
my_fasum(int N, float *x) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern float sasum_(int *N, float *x, int *incx);
|
||||
int i=1;
|
||||
return(sasum_(&N,x,&i));
|
||||
}
|
||||
|
||||
/* BLAS y = a.x + y */
|
||||
void
|
||||
my_daxpy(int N, double *x, double a, double *y) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern void daxpy_(int *N, double *alpha, double *x, int *incx, double *y, int *incy);
|
||||
int i=1; /* strides */
|
||||
daxpy_(&N,&a,x,&i,y,&i);
|
||||
}
|
||||
|
||||
/* BLAS y = a.x + y */
|
||||
void
|
||||
my_daxpys(int N, double *x, int incx, double a, double *y, int incy) {
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
extern void daxpy_(int *N, double *alpha, double *x, int *incx, double *y, int *incy);
|
||||
daxpy_(&N,&a,x,&incx,y,&incy);
|
||||
}
|
||||
|
||||
/* AXPY update y := a*x + y (float, unit strides), via BLAS saxpy. */
void
my_saxpy(int N, float *x, float a, float *y) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void saxpy_(int *N, float *alpha, float *x, int *incx, float *y, int *incy);
  int inc=1; /* both vectors contiguous */
  saxpy_(&N,&a,x,&inc,y,&inc);
}
|
||||
|
||||
|
||||
|
||||
/* Index of the element with largest |x_i| (1-based, Fortran convention). */
int
my_idamax(int N, double *x, int incx) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern int idamax_(int *N, double *x, int *incx);
  int idx=idamax_(&N,x,&incx);
  return idx;
}
|
||||
|
||||
/* Index of the element with largest |x_i| (float, 1-based index). */
int
my_isamax(int N, float *x, int incx) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern int isamax_(int *N, float *x, int *incx);
  int idx=isamax_(&N,x,&incx);
  return idx;
}
|
||||
|
||||
/* Index of the element with smallest |x_i| (1-based, Fortran convention).
   NOTE(review): idamin_ is an extension (not in reference BLAS) — confirm
   the linked BLAS (OpenBLAS per the Makefile) provides it. */
int
my_idamin(int N, double *x, int incx) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern int idamin_(int *N, double *x, int *incx);
  int idx=idamin_(&N,x,&incx);
  return idx;
}
|
||||
|
||||
/* Matrix-matrix product C := alpha*op(A)*op(B) + beta*C (DGEMM).
   transa/transb select op() as 'N' or 'T'; matrices are column major. */
void
my_dgemm(char transa, char transb, int M, int N, int K, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, double *ALPHA, double *A, int *LDA, double *B, int * LDB, double *BETA, double *C, int *LDC);
  /* all scalars passed by address for the Fortran calling convention */
  dgemm_(&transa,&transb,&M,&N,&K,&alpha,A,&lda,B,&ldb,&beta,C,&ldc);
}
|
||||
|
||||
/* Matrix-vector product y := alpha*op(A)*x + beta*y (DGEMV), op 'N'/'T'. */
void
my_dgemv(char trans, int M, int N, double alpha, double *A, int lda, double *x, int incx, double beta, double *y, int incy) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgemv_(char *TRANS, int *M, int *N, double *ALPHA, double *A, int *LDA, double *X, int *INCX, double *BETA, double *Y, int *INCY);
  /* scalars by address for Fortran linkage */
  dgemv_(&trans,&M,&N,&alpha,A,&lda,x,&incx,&beta,y,&incy);
}
|
||||
|
||||
|
||||
/* ---- LAPACK helpers used by the solvers ---- */
/* Cholesky factorization of a real symmetric matrix (DPOTRF).
   uplo: 'U' or 'L'. Returns the LAPACK INFO code (0 on success). */
int
my_dpotrf(char uplo, int N, double *A, int lda) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dpotrf_(char *uplo, int *N, double *A, int *lda, int *info);
  int info=0;
  dpotrf_(&uplo,&N,A,&lda,&info);
  return info;
}
|
||||
|
||||
/* Solve A x = b given the Cholesky factor from my_dpotrf (DPOTRS).
   Solution overwrites b. Returns the LAPACK INFO code (0 on success). */
int
my_dpotrs(char uplo, int N, int nrhs, double *A, int lda, double *b, int ldb){
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dpotrs_(char *uplo, int *N, int *nrhs, double *A, int *lda, double *b, int *ldb, int *info);
  int info=0;
  dpotrs_(&uplo,&N,&nrhs,A,&lda,b,&ldb,&info);
  return info;
}
|
||||
|
||||
/* Least-squares solve of A x = b via QR/LQ factorization (DGELS).
   Solution overwrites B; WORK/LWORK is caller-provided workspace.
   Returns the LAPACK INFO code (0 on success). */
int
my_dgels(char TRANS, int M, int N, int NRHS, double *A, int LDA, double *B, int LDB, double *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgels_(char *TRANS, int *M, int *N, int *NRHS, double *A, int *LDA, double *B, int *LDB, double *WORK, int *LWORK, int *INFO);
  int info=0;
  dgels_(&TRANS,&M,&N,&NRHS,A,&LDA,B,&LDB,WORK,&LWORK,&info);
  return info;
}
|
||||
|
||||
|
||||
/* SVD A = U S V^T (DGESVD); the routine returns V^T directly, so the
   caller must not transpose it again. Returns the LAPACK INFO code. */
int
my_dgesvd(char JOBU, char JOBVT, int M, int N, double *A, int LDA, double *S,
   double *U, int LDU, double *VT, int LDVT, double *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgesvd_(char *JOBU, char *JOBVT, int *M, int *N, double *A,
    int *LDA, double *S, double *U, int *LDU, double *VT, int *LDVT,
    double *WORK, int *LWORK, int *info);
  int info=0;
  dgesvd_(&JOBU,&JOBVT,&M,&N,A,&LDA,S,U,&LDU,VT,&LDVT,WORK,&LWORK,&info);
  return info;
}
|
||||
|
||||
/* QR factorization A = Q R (DGEQRF). On exit R is stored in the upper
   triangle of A; Q is represented implicitly by TAU + the lower part of A.
   Returns the LAPACK INFO code (0 on success). */
int
my_dgeqrf(int M, int N, double *A, int LDA, double *TAU, double *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dgeqrf_(int *M, int *N, double *A, int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
  int info=0;
  dgeqrf_(&M,&N,A,&LDA,TAU,WORK,&LWORK,&info);
  return info;
}
|
||||
|
||||
/* Form the explicit Q matrix from the elementary reflectors produced by
   my_dgeqrf (DORGQR). Returns the LAPACK INFO code (0 on success). */
int
my_dorgqr(int M,int N,int K,double *A,int LDA,double *TAU,double *WORK,int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dorgqr_(int *M, int *N, int *K, double *A, int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
  int info=0;
  dorgqr_(&M,&N,&K,A,&LDA,TAU,WORK,&LWORK,&info);
  return info;
}
|
||||
|
||||
/* Solve the triangular system op(A) x = b (DTRTRS); solution overwrites B.
   UPLO 'U'/'L', TRANS 'N'/'T', DIAG 'N'/'U'. Returns the LAPACK INFO code. */
int
my_dtrtrs(char UPLO, char TRANS, char DIAG,int N,int NRHS,double *A,int LDA,double *B,int LDB) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void dtrtrs_(char *UPLO,char *TRANS,char *DIAG,int *N,int *NRHS,double *A,int *LDA,double *B,int *LDB,int *INFO);
  int info=0;
  dtrtrs_(&UPLO,&TRANS,&DIAG,&N,&NRHS,A,&LDA,B,&LDB,&info);
  return info;
}
|
||||
|
||||
|
||||
/* Strided complex copy y := x (ZCOPY).
   Reads every Nx-th element of x and writes every Ny-th element of y,
   so x must hold at least N*Nx and y at least N*Ny elements.
   When both strides are 1, a plain memcpy is used instead of the BLAS call. */
void
my_ccopy(int N, complex double *x, int Nx, complex double *y, int Ny) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zcopy_(int *N, complex double *x, int *incx, complex double *y, int *incy);
  if (Nx!=1 || Ny!=1) {
    zcopy_(&N,x,&Nx,y,&Ny);
  } else {
    /* contiguous case: memcpy avoids the BLAS call overhead */
    memcpy((void*)y,(void*)x,sizeof(complex double)*(size_t)N);
  }
}
|
||||
|
||||
/* In-place complex scaling x := a*x (ZSCAL, unit stride). */
void
my_cscal(int N, complex double a, complex double *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zscal_(int *N, complex double *alpha, complex double *x, int *incx);
  int inc=1; /* contiguous storage */
  zscal_(&N,&a,x,&inc);
}
|
||||
|
||||
/* Complex AXPY update y := a*x + y (ZAXPY, unit strides). */
void
my_caxpy(int N, complex double *x, complex double a, complex double *y) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zaxpy_(int *N, complex double *alpha, complex double *x, int *incx, complex double *y, int *incy);
  int inc=1; /* both vectors contiguous */
  zaxpy_(&N,&a,x,&inc,y,&inc);
}
|
||||
|
||||
|
||||
/* BLAS x^H*y */
/* Conjugated complex dot product x^H * y via ZDOTC, unit strides.
   NOTE(review): declaring zdotc_ as returning complex double relies on the
   "return by value" Fortran complex ABI; some BLAS builds (e.g. f2c/MKL
   gfortran variants) instead return the result through a hidden first
   argument. This matches the OpenBLAS build used here — confirm before
   switching BLAS libraries. */
complex double
my_cdot(int N, complex double *x, complex double *y) {
#ifdef USE_MIC
__attribute__ ((target(MIC)))
#endif
 extern complex double zdotc_(int *N, complex double *x, int *incx, complex double *y, int *incy);
 int i=1; /* unit stride for both vectors */
 return(zdotc_(&N,x,&i,y,&i));
}
|
||||
|
||||
/* Complex SVD A = U S V^H (ZGESVD); the routine returns V^H directly, so
   the caller must not transpose it again. RWORK is a caller-supplied real
   workspace. Returns the LAPACK INFO code (0 on success). */
int
my_zgesvd(char JOBU, char JOBVT, int M, int N, complex double *A, int LDA, double *S,
   complex double *U, int LDU, complex double *VT, int LDVT, complex double *WORK, int LWORK, double *RWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zgesvd_(char *JOBU, char *JOBVT, int *M, int *N, complex double *A,
    int *LDA, double *S, complex double *U, int *LDU, complex double *VT, int *LDVT,
    complex double *WORK, int *LWORK, double *RWORK, int *info);
  int info=0;
  zgesvd_(&JOBU,&JOBVT,&M,&N,A,&LDA,S,U,&LDU,VT,&LDVT,WORK,&LWORK,RWORK,&info);
  return info;
}
|
||||
|
||||
/* Complex least-squares solve of A x = b via QR/LQ (ZGELS).
   Solution overwrites B. Returns the LAPACK INFO code (0 on success). */
int
my_zgels(char TRANS, int M, int N, int NRHS, complex double *A, int LDA, complex double *B, int LDB, complex double *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zgels_(char *TRANS, int *M, int *N, int *NRHS, complex double *A, int *LDA, complex double *B, int *LDB, complex double *WORK, int *LWORK, int *INFO);
  int info=0;
  zgels_(&TRANS,&M,&N,&NRHS,A,&LDA,B,&LDB,WORK,&LWORK,&info);
  return info;
}
|
||||
|
||||
|
||||
/* Single-precision complex least-squares solve of A x = b via QR/LQ (CGELS).
   Solution overwrites B. Returns the LAPACK INFO code (0 on success). */
int
my_cgels(char TRANS, int M, int N, int NRHS, complex float *A, int LDA, complex float *B, int LDB, complex float *WORK, int LWORK) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void cgels_(char *TRANS, int *M, int *N, int *NRHS, complex float *A, int *LDA, complex float *B, int *LDB, complex float *WORK, int *LWORK, int *INFO);
  int info=0;
  cgels_(&TRANS,&M,&N,&NRHS,A,&LDA,B,&LDB,WORK,&LWORK,&info);
  return info;
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* Complex matrix-matrix product C := alpha*op(A)*op(B) + beta*C (ZGEMM).
   transa/transb select op() as 'N', 'T' or 'C'; matrices are column major. */
void
my_zgemm(char transa, char transb, int M, int N, int K, complex double alpha, complex double *A, int lda, complex double *B, int ldb, complex double beta, complex double *C, int ldc) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void zgemm_(char *TRANSA, char *TRANSB, int *M, int *N, int *K, complex double *ALPHA, complex double *A, int *LDA, complex double *B, int * LDB, complex double *BETA, complex double *C, int *LDC);
  /* all scalars passed by address for the Fortran calling convention */
  zgemm_(&transa,&transb,&M,&N,&K,&alpha,A,&lda,B,&ldb,&beta,C,&ldc);
}
|
||||
|
||||
/* Euclidean norm ||x||_2 of a complex vector (real result), via dznrm2. */
double
my_cnrm2(int N, complex double *x) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern double dznrm2_(int *N, complex double *x, int *incx);
  int inc=1; /* contiguous storage */
  return dznrm2_(&N,x,&inc);
}
|
||||
|
||||
/* Strided float copy y := x (SCOPY).
   Reads every Nx-th element of x and writes every Ny-th element of y,
   so x must hold at least N*Nx and y at least N*Ny elements.
   When both strides are 1, a plain memcpy is used instead of the BLAS call. */
void
my_fcopy(int N, float *x, int Nx, float *y, int Ny) {
#ifdef USE_MIC
  __attribute__ ((target(MIC)))
#endif
  extern void scopy_(int *N, float *x, int *incx, float *y, int *incy);
  if (Nx!=1 || Ny!=1) {
    scopy_(&N,x,&Nx,y,&Ny);
  } else {
    /* contiguous case: memcpy avoids the BLAS call overhead */
    memcpy((void*)y,(void*)x,sizeof(float)*(size_t)N);
  }
}
|
||||
|
||||
|
||||
/* Expert eigensolver for a real symmetric matrix (DSYEVX).
   jobz 'N'/'V' (values only / values+vectors), range 'A'/'V'/'I'.
   Eigenvalues go to W, optional eigenvectors to Z; ifail lists any
   non-converged indices. Returns the LAPACK INFO code (0 on success). */
int
my_dsyevx(char jobz, char range, char uplo, int N, double *A, int lda,
   double vl, double vu, int il, int iu, double abstol, int M, double *W,
   double *Z, int ldz, double *WORK, int lwork, int *iwork, int *ifail) {

  extern void dsyevx_(char *JOBZ, char *RANGE, char *UPLO, int *N, double *A, int *LDA,
    double *VL, double *VU, int *IL, int *IU, double *ABSTOL, int *M, double *W, double *Z,
    int *LDZ, double *WORK, int *LWORK, int *IWORK, int *IFAIL, int *INFO);
  int info=0;
  dsyevx_(&jobz,&range,&uplo,&N,A,&lda,&vl,&vu,&il,&iu,&abstol,&M,W,Z,&ldz,WORK,&lwork,iwork,ifail,&info);
  return info;
}
|
||||
|
||||
|
||||
|
||||
/* Hermitian rank-1 update A := alpha*x*x^H + A (ZHER); alpha is real and
   only the uplo ('U'/'L') triangle of A is referenced/updated. */
void
my_zher(char uplo, int N, double alpha, complex double *x, int incx, complex double *A, int lda) {
  extern void zher_(char *UPLO, int *N, double *ALPHA, complex double *X, int *INCX, complex double *A, int *LDA);
  zher_(&uplo,&N,&alpha,x,&incx,A,&lda);
}
|
Binary file not shown.
|
@ -1,705 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#include "Solvers.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
//#define DEBUG
|
||||
/* helper functions for diagnostics */
|
||||
static void
|
||||
checkCudaError(cudaError_t err, char *file, int line)
|
||||
{
|
||||
#ifdef CUDA_DEBUG
|
||||
if(!err)
|
||||
return;
|
||||
fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
|
||||
exit(EXIT_FAILURE);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
checkCublasError(cublasStatus_t cbstatus, char *file, int line)
|
||||
{
|
||||
#ifdef CUDA_DEBUG
|
||||
if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
|
||||
fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* OS-LM, but f() and jac() calculations are done
|
||||
entirely in the GPU */
|
||||
int
|
||||
oslevmar_der_single_cuda(
|
||||
void (*func)(double *p, double *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */
|
||||
void (*jacf)(double *p, double *j, int m, int n, void *adata), /* function to evaluate the Jacobian \part x / \part p */
|
||||
double *p, /* I/O: initial parameter estimates. On output has the estimated solution */
|
||||
double *x, /* I: measurement vector. NULL implies a zero vector */
|
||||
int M, /* I: parameter vector dimension (i.e. #unknowns) */
|
||||
int N, /* I: measurement vector dimension */
|
||||
int itmax, /* I: maximum number of iterations */
|
||||
double opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
|
||||
* stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used
|
||||
*/
|
||||
double info[10],
|
||||
/* O: information regarding the minimization. Set to NULL if don't care
|
||||
* info[0]= ||e||_2 at initial p.
|
||||
* info[1-4]=[ ||e||_2, ||J^T e||_inf, ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
|
||||
* info[5]= # iterations,
|
||||
* info[6]=reason for terminating: 1 - stopped by small gradient J^T e
|
||||
* 2 - stopped by small Dp
|
||||
* 3 - stopped by itmax
|
||||
* 4 - singular matrix. Restart from current p with increased mu
|
||||
* 5 - no further error reduction is possible. Restart with increased mu
|
||||
* 6 - stopped by small ||e||_2
|
||||
* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
|
||||
* info[7]= # function evaluations
|
||||
* info[8]= # Jacobian evaluations
|
||||
* info[9]= # linear systems solved, i.e. # attempts for reducing error
|
||||
*/
|
||||
|
||||
cublasHandle_t cbhandle, /* device handle */
|
||||
cusolverDnHandle_t solver_handle, /* solver handle */
|
||||
double *gWORK, /* GPU allocated memory */
|
||||
int linsolv, /* 0 Cholesky, 1 QR, 2 SVD */
|
||||
int tileoff, /* tile offset when solving for many chunks */
|
||||
int ntiles, /* total tile (data) size being solved for */
|
||||
int randomize, /* if >0 randomize */
|
||||
void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
|
||||
* Set to NULL if not needed
|
||||
*/
|
||||
{
|
||||
|
||||
/* general note: all device variables end with a 'd' */
|
||||
int stop=0;
|
||||
cudaError_t err;
|
||||
cublasStatus_t cbstatus;
|
||||
|
||||
int nu=2,nu2;
|
||||
double p_L2, Dp_L2=DBL_MAX, dF, dL, p_eL2, jacTe_inf=0.0, pDp_eL2, init_p_eL2;
|
||||
double tmp,mu=0.0;
|
||||
double tau, eps1, eps2, eps2_sq, eps3;
|
||||
int k,ci,issolved;
|
||||
|
||||
double *hxd;
|
||||
|
||||
double *ed;
|
||||
double *xd;
|
||||
|
||||
double *jacd;
|
||||
|
||||
double *jacTjacd,*jacTjacd0;
|
||||
|
||||
double *Dpd,*bd;
|
||||
double *pd,*pnewd;
|
||||
double *jacTed;
|
||||
|
||||
/* used in QR solver */
|
||||
double *taud;
|
||||
|
||||
/* used in SVD solver */
|
||||
double *Ud;
|
||||
double *VTd;
|
||||
double *Sd;
|
||||
|
||||
/* ME data */
|
||||
me_data_t *dp=(me_data_t*)adata;
|
||||
int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
|
||||
/* coherency on device */
|
||||
double *cohd;
|
||||
/* baseline-station map on device/host */
|
||||
short *bbd;
|
||||
|
||||
int solve_axb=linsolv;
|
||||
|
||||
/* setup default settings */
|
||||
if(opts){
|
||||
tau=opts[0];
|
||||
eps1=opts[1];
|
||||
eps2=opts[2];
|
||||
eps2_sq=opts[2]*opts[2];
|
||||
eps3=opts[3];
|
||||
} else {
|
||||
tau=CLM_INIT_MU;
|
||||
eps1=CLM_STOP_THRESH;
|
||||
eps2=CLM_STOP_THRESH;
|
||||
eps2_sq=CLM_STOP_THRESH*CLM_STOP_THRESH;
|
||||
eps3=CLM_STOP_THRESH;
|
||||
}
|
||||
|
||||
/* calculate no of cuda threads and blocks */
|
||||
int ThreadsPerBlock=DEFAULT_TH_PER_BK;
|
||||
int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;
|
||||
|
||||
|
||||
unsigned long int moff;
|
||||
if (!gWORK) {
|
||||
err=cudaMalloc((void**)&xd, N*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&jacd, M*N*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&jacTjacd, M*M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&jacTed, M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&jacTjacd0, M*M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&Dpd, M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&bd, M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&pd, M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&pnewd, M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* needed for calculating f() and jac() */
|
||||
err=cudaMalloc((void**) &bbd, Nbase*2*sizeof(short));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* we need coherencies for only this cluster */
|
||||
err=cudaMalloc((void**) &cohd, Nbase*8*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&hxd, N*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&ed, N*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* memory allocation: different solvers */
|
||||
if (solve_axb==1) {
|
||||
err=cudaMalloc((void**)&taud, M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
} else if (solve_axb==2) {
|
||||
err=cudaMalloc((void**)&Ud, M*M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&VTd, M*M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&Sd, M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
}
|
||||
} else {
|
||||
moff=0;
|
||||
xd=&gWORK[moff];
|
||||
moff+=N;
|
||||
jacd=&gWORK[moff];
|
||||
moff+=M*N;
|
||||
jacTjacd=&gWORK[moff];
|
||||
moff+=M*M;
|
||||
jacTed=&gWORK[moff];
|
||||
moff+=M;
|
||||
jacTjacd0=&gWORK[moff];
|
||||
moff+=M*M;
|
||||
Dpd=&gWORK[moff];
|
||||
moff+=M;
|
||||
bd=&gWORK[moff];
|
||||
moff+=M;
|
||||
pd=&gWORK[moff];
|
||||
moff+=M;
|
||||
pnewd=&gWORK[moff];
|
||||
moff+=M;
|
||||
cohd=&gWORK[moff];
|
||||
moff+=Nbase*8;
|
||||
hxd=&gWORK[moff];
|
||||
moff+=N;
|
||||
ed=&gWORK[moff];
|
||||
moff+=N;
|
||||
if (solve_axb==1) {
|
||||
taud=&gWORK[moff];
|
||||
moff+=M;
|
||||
} else if (solve_axb==2) {
|
||||
Ud=&gWORK[moff];
|
||||
moff+=M*M;
|
||||
VTd=&gWORK[moff];
|
||||
moff+=M*M;
|
||||
Sd=&gWORK[moff];
|
||||
moff+=M;
|
||||
}
|
||||
bbd=(short*)&gWORK[moff];
|
||||
moff+=(Nbase*2*sizeof(short))/sizeof(double);
|
||||
}
|
||||
|
||||
/* extra storage for cusolver */
|
||||
int work_size=0;
|
||||
int *devInfo;
|
||||
int devInfo_h=0;
|
||||
err=cudaMalloc((void**)&devInfo, sizeof(int));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
double *work;
|
||||
double *rwork;
|
||||
if (solve_axb==0) {
|
||||
cusolverDnDpotrf_bufferSize(solver_handle, CUBLAS_FILL_MODE_UPPER, M, jacTjacd, M, &work_size);
|
||||
err=cudaMalloc((void**)&work, work_size*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
} else if (solve_axb==1) {
|
||||
cusolverDnDgeqrf_bufferSize(solver_handle, M, M, jacTjacd, M, &work_size);
|
||||
err=cudaMalloc((void**)&work, work_size*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
} else {
|
||||
cusolverDnDgesvd_bufferSize(solver_handle, M, M, &work_size);
|
||||
err=cudaMalloc((void**)&work, work_size*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
err=cudaMalloc((void**)&rwork, 5*M*sizeof(double));
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
}
|
||||
|
||||
|
||||
err=cudaMemcpyAsync(pd, p, M*sizeof(double), cudaMemcpyHostToDevice,0);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* need to give right offset for coherencies */
|
||||
/* offset: cluster offset+time offset */
|
||||
err=cudaMemcpyAsync(cohd, &(dp->ddcoh[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(double), cudaMemcpyHostToDevice,0);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* correct offset for baselines */
|
||||
err=cudaMemcpyAsync(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(short), cudaMemcpyHostToDevice,0);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
cudaDeviceSynchronize();
|
||||
/* xd <=x */
|
||||
err=cudaMemcpyAsync(xd, x, N*sizeof(double), cudaMemcpyHostToDevice,0);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
|
||||
/* ### compute e=x - f(p) and its L2 norm */
|
||||
/* ### e=x-hx, p_eL2=||e|| */
|
||||
/* p: params (Mx1), x: data (Nx1), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations*/
|
||||
cudakernel_func(ThreadsPerBlock, (Nbase+ThreadsPerBlock-1)/ThreadsPerBlock, pd,hxd,M,N, cohd, bbd, Nbase, dp->M, dp->N);
|
||||
|
||||
/* e=x */
|
||||
cbstatus=cublasDcopy(cbhandle, N, xd, 1, ed, 1);
|
||||
/* e=x-hx */
|
||||
double alpha=-1.0;
|
||||
cbstatus=cublasDaxpy(cbhandle, N, &alpha, hxd, 1, ed, 1);
|
||||
|
||||
/* norm ||e|| */
|
||||
cbstatus=cublasDnrm2(cbhandle, N, ed, 1, &p_eL2);
|
||||
/* square */
|
||||
p_eL2=p_eL2*p_eL2;
|
||||
|
||||
init_p_eL2=p_eL2;
|
||||
if(!finite(p_eL2)) stop=7;
|
||||
|
||||
/* setup OS subsets and stating offsets */
|
||||
/* ed : N, cohd : Nbase*8, bbd : Nbase*2 full size */
|
||||
/* if ntiles<Nsubsets, make Nsubsets=ntiles */
|
||||
int Nsubsets=10;
|
||||
if (ntiles<Nsubsets) { Nsubsets=ntiles; }
|
||||
/* FIXME: is 0.1 enough ? */
|
||||
int max_os_iter=(int)ceil(0.1*(double)Nsubsets);
|
||||
int Npersubset=(N+Nsubsets-1)/Nsubsets;
|
||||
int Nbasepersubset=(Nbase+Nsubsets-1)/Nsubsets;
|
||||
int *Nos,*Nbaseos,*edI,*NbI,*subI=0;
|
||||
if ((Nos=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((Nbaseos=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((edI=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
if ((NbI=(int*)calloc((size_t)Nsubsets,sizeof(int)))==0) {
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
exit(1);
|
||||
}
|
||||
int l,ositer;
|
||||
k=l=0;
|
||||
for (ci=0; ci<Nsubsets; ci++) {
|
||||
edI[ci]=k;
|
||||
NbI[ci]=l;
|
||||
if (k+Npersubset<N) {
|
||||
Nos[ci]=Npersubset;
|
||||
Nbaseos[ci]=Nbasepersubset;
|
||||
} else {
|
||||
Nos[ci]=N-k;
|
||||
Nbaseos[ci]=Nbase-l;
|
||||
}
|
||||
k=k+Npersubset;
|
||||
l=l+Nbasepersubset;
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
for (ci=0; ci<Nsubsets; ci++) {
|
||||
printf("ci=%d, Nos=%d, edI=%d, Nbseos=%d, NbI=%d\n",ci,Nos[ci],edI[ci],Nbaseos[ci],NbI[ci]);
|
||||
}
|
||||
#endif
|
||||
|
||||
/**** iteration loop ***********/
|
||||
for(k=0; k<itmax && !stop; ++k){
|
||||
#ifdef DEBUG
|
||||
printf("iter=%d err=%lf\n",k,p_eL2);
|
||||
#endif
|
||||
if(p_eL2<=eps3){ /* error is small */
|
||||
stop=6;
|
||||
break;
|
||||
}
|
||||
|
||||
if (randomize) {
|
||||
/* random permutation of subsets */
|
||||
subI=random_permutation(Nsubsets,0,0);
|
||||
}
|
||||
/**************** OS loop ***************************/
|
||||
for (ositer=0; ositer<max_os_iter; ositer++) {
|
||||
/* select subset to compute Jacobian */
|
||||
if (randomize) {
|
||||
l=subI[ositer];
|
||||
} else {
|
||||
l=(k+ositer)%Nsubsets;
|
||||
}
|
||||
/* NOTE: no. of subsets >= no. of OS iterations, so select
|
||||
a random set of subsets */
|
||||
/* N, Nbase changes with subset, cohd,bbd,ed gets offsets */
|
||||
/* ed : N, cohd : Nbase*8, bbd : Nbase*2 full size */
|
||||
/* p: params (Mx1), jacd: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations*/
|
||||
/* FIXME thread/block sizes 16x16=256, so 16 is chosen */
|
||||
//cudakernel_jacf(ThreadsPerBlock, ThreadsPerBlock/4, pd, jacd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
|
||||
cudakernel_jacf(ThreadsPerBlock, ThreadsPerBlock/4, pd, jacd, M, Nos[l], &cohd[8*NbI[l]], &bbd[2*NbI[l]], Nbaseos[l], dp->M, dp->N);
|
||||
|
||||
/* Compute J^T J and J^T e */
|
||||
/* Cache efficient computation of J^T J based on blocking
|
||||
*/
|
||||
/* since J is in ROW major order, assume it is transposed,
|
||||
so actually calculate A=J*J^T, where J is size MxN */
|
||||
//status=culaDeviceDgemm('N','T',M,M,Nos[l],1.0,jacd,M,jacd,M,0.0,jacTjacd,M);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
double cone=1.0; double czero=0.0;
|
||||
cbstatus=cublasDgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_T,M,M,Nos[l],&cone,jacd,M,jacd,M,&czero,jacTjacd,M);
|
||||
|
||||
/* create backup */
|
||||
/* copy jacTjacd0<=jacTjacd */
|
||||
cbstatus=cublasDcopy(cbhandle, M*M, jacTjacd, 1, jacTjacd0, 1);
|
||||
/* J^T e */
|
||||
/* calculate b=J^T*e (actually compute b=J*e, where J in row major (size MxN) */
|
||||
//status=culaDeviceDgemv('N',M,Nos[l],1.0,jacd,M,&ed[edI[l]],1,0.0,jacTed,1);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_N,M,Nos[l],&cone,jacd,M,&ed[edI[l]],1,&czero,jacTed,1);
|
||||
|
||||
|
||||
|
||||
/* Compute ||J^T e||_inf and ||p||^2 */
|
||||
/* find infinity norm of J^T e, 1 based indexing*/
|
||||
cbstatus=cublasIdamax(cbhandle, M, jacTed, 1, &ci);
|
||||
err=cudaMemcpy(&jacTe_inf,&(jacTed[ci-1]),sizeof(double),cudaMemcpyDeviceToHost);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
/* L2 norm of current parameter values */
|
||||
/* norm ||Dp|| */
|
||||
cbstatus=cublasDnrm2(cbhandle, M, pd, 1, &p_L2);
|
||||
p_L2=p_L2*p_L2;
|
||||
if(jacTe_inf<0.0) {jacTe_inf=-jacTe_inf;}
|
||||
#ifdef DEBUG
|
||||
printf("Inf norm=%lf\n",jacTe_inf);
|
||||
#endif
|
||||
|
||||
/* check for convergence */
|
||||
if((jacTe_inf <= eps1)){
|
||||
Dp_L2=0.0; /* no increment for p in this case */
|
||||
stop=1;
|
||||
break;
|
||||
}
|
||||
|
||||
/* compute initial (k=0) damping factor */
|
||||
if (k==0) {
|
||||
/* find max diagonal element (stride is M+1) */
|
||||
/* should be MAX not MAX(ABS) */
|
||||
cbstatus=cublasIdamax(cbhandle, M, jacTjacd, M+1, &ci); /* 1 based index */
|
||||
ci=(ci-1)*(M+1); /* right value of the diagonal */
|
||||
|
||||
err=cudaMemcpy(&tmp,&(jacTjacd[ci]),sizeof(double),cudaMemcpyDeviceToHost);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
mu=tau*tmp;
|
||||
}
|
||||
|
||||
|
||||
/* determine increment using adaptive damping */
|
||||
while(1){
|
||||
/* augment normal equations */
|
||||
/* increment A => A+ mu*I, increment diagonal entries */
|
||||
/* copy jacTjacd<=jacTjacd0 */
|
||||
cbstatus=cublasDcopy(cbhandle, M*M, jacTjacd0, 1, jacTjacd, 1);
|
||||
cudakernel_diagmu(ThreadsPerBlock, BlocksPerGrid, M, jacTjacd, mu);
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("mu=%lf\n",mu);
|
||||
#endif
|
||||
/*************************************************************************/
|
||||
issolved=0;
|
||||
/* solve augmented equations A x = b */
|
||||
/* A==jacTjacd, b==Dpd, after solving, x==Dpd */
|
||||
/* b=jacTed : intially right hand side, at exit the solution */
|
||||
if (solve_axb==0) {
|
||||
/* Cholesky solver **********************/
|
||||
/* lower triangle of Ad is destroyed */
|
||||
//status=culaDeviceDpotrf('U',M,jacTjacd,M);
|
||||
cusolverDnDpotrf(solver_handle, CUBLAS_FILL_MODE_UPPER, M, jacTjacd, M, work, work_size, devInfo);
|
||||
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if (!devInfo_h) {
|
||||
issolved=1;
|
||||
} else {
|
||||
issolved=0;
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"Singular matrix\n");
|
||||
#endif
|
||||
}
|
||||
if (issolved) {
|
||||
/* copy Dpd<=jacTed */
|
||||
cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
|
||||
#ifdef DEBUG
|
||||
checkCublasError(cbstatus,__FILE__,__LINE__);
|
||||
#endif
|
||||
//status=culaDeviceDpotrs('U',M,1,jacTjacd,M,Dpd,M);
|
||||
cusolverDnDpotrs(solver_handle, CUBLAS_FILL_MODE_UPPER,M,1,jacTjacd,M,Dpd,M,devInfo);
|
||||
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if (devInfo_h) {
|
||||
issolved=0;
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"Singular matrix\n");
|
||||
#endif
|
||||
}
|
||||
}
|
||||
} else if (solve_axb==1) {
|
||||
/* QR solver ********************************/
|
||||
//status=culaDeviceDgeqrf(M,M,jacTjacd,M,taud);
|
||||
cusolverDnDgeqrf(solver_handle, M, M, jacTjacd, M, taud, work, work_size, devInfo);
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if (!devInfo_h) {
|
||||
issolved=1;
|
||||
} else {
|
||||
issolved=0;
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"Singular matrix\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
if (issolved) {
|
||||
/* copy Dpd<=jacTed */
|
||||
cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
|
||||
//status=culaDeviceDgeqrs(M,M,1,jacTjacd,M,taud,Dpd,M);
|
||||
cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, M, 1, M, jacTjacd, M, taud, Dpd, M, work, work_size, devInfo);
|
||||
cudaDeviceSynchronize();
|
||||
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if (devInfo_h) {
|
||||
issolved=0;
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"Singular matrix\n");
|
||||
#endif
|
||||
} else {
|
||||
cone=1.0;
|
||||
cbstatus=cublasDtrsm(cbhandle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,M,1,&cone,jacTjacd,M,Dpd,M);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* SVD solver *********************************/
|
||||
/* U S VT = A */
|
||||
//status=culaDeviceDgesvd('A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
cusolverDnDgesvd(solver_handle,'A','A',M,M,jacTjacd,M,Sd,Ud,M,VTd,M,work,work_size,rwork,devInfo);
|
||||
cudaDeviceSynchronize();
|
||||
/* copy Dpd<=jacTed */
|
||||
cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, Dpd, 1);
|
||||
/* b<=U^T * b */
|
||||
//status=culaDeviceDgemv('T',M,M,1.0,Ud,M,Dpd,1,0.0,Dpd,1);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
cone=1.0; czero=0.0;
|
||||
cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_T,M,M,&cone,Ud,M,Dpd,1,&czero,Dpd,1);
|
||||
|
||||
/* divide by singular values Dpd[]/Sd[] for Sd[]> eps1 */
|
||||
cudakernel_diagdiv(ThreadsPerBlock, BlocksPerGrid, M, eps1, Dpd, Sd);
|
||||
|
||||
/* b<=VT^T * b */
|
||||
//status=culaDeviceDgemv('T',M,M,1.0,VTd,M,Dpd,1,0.0,Dpd,1);
|
||||
//checkStatus(status,__FILE__,__LINE__);
|
||||
cbstatus=cublasDgemv(cbhandle,CUBLAS_OP_T,M,M,&cone,VTd,M,Dpd,1,&czero,Dpd,1);
|
||||
|
||||
|
||||
issolved=1;
|
||||
}
|
||||
/*************************************************************************/
|
||||
|
||||
/* compute p's new estimate and ||Dp||^2 */
|
||||
if (issolved) {
|
||||
/* compute p's new estimate and ||Dp||^2 */
|
||||
/* pnew=p+Dp */
|
||||
/* pnew=p */
|
||||
cbstatus=cublasDcopy(cbhandle, M, pd, 1, pnewd, 1);
|
||||
/* pnew=pnew+Dp */
|
||||
alpha=1.0;
|
||||
cbstatus=cublasDaxpy(cbhandle, M, &alpha, Dpd, 1, pnewd, 1);
|
||||
|
||||
/* norm ||Dp|| */
|
||||
cbstatus=cublasDnrm2(cbhandle, M, Dpd, 1, &Dp_L2);
|
||||
Dp_L2=Dp_L2*Dp_L2;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("norm ||dp|| =%lf, norm ||p||=%lf\n",Dp_L2,p_L2);
|
||||
#endif
|
||||
if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
|
||||
stop=2;
|
||||
break;
|
||||
}
|
||||
|
||||
if(Dp_L2>=(p_L2+eps2)/(CLM_EPSILON*CLM_EPSILON)){ /* almost singular */
|
||||
stop=4;
|
||||
break;
|
||||
}
|
||||
|
||||
/* new function value */
|
||||
/* compute ||e(pDp)||_2 */
|
||||
/* ### hx=x-hx, pDp_eL2=||hx|| */
|
||||
/* copy to device */
|
||||
/* hxd<=hx */
|
||||
cudakernel_func(ThreadsPerBlock, (Nbase+ThreadsPerBlock-1)/ThreadsPerBlock, pnewd, hxd, M, N, cohd, bbd, Nbase, dp->M, dp->N);
|
||||
|
||||
/* e=x */
|
||||
cbstatus=cublasDcopy(cbhandle, N, xd, 1, ed, 1);
|
||||
/* e=x-hx */
|
||||
alpha=-1.0;
|
||||
cbstatus=cublasDaxpy(cbhandle, N, &alpha, hxd, 1, ed, 1);
|
||||
/* note: e is updated */
|
||||
|
||||
/* norm ||e|| */
|
||||
cbstatus=cublasDnrm2(cbhandle, N, ed, 1, &pDp_eL2);
|
||||
pDp_eL2=pDp_eL2*pDp_eL2;
|
||||
|
||||
|
||||
if(!finite(pDp_eL2)){ /* sum of squares is not finite, most probably due to a user error.
|
||||
*/
|
||||
stop=7;
|
||||
break;
|
||||
}
|
||||
|
||||
/* dL=Dp'*(mu*Dp+jacTe) */
|
||||
/* bd=jacTe+mu*Dp */
|
||||
cbstatus=cublasDcopy(cbhandle, M, jacTed, 1, bd, 1);
|
||||
cbstatus=cublasDaxpy(cbhandle, M, &mu, Dpd, 1, bd, 1);
|
||||
cbstatus=cublasDdot(cbhandle, M, Dpd, 1, bd, 1, &dL);
|
||||
|
||||
dF=p_eL2-pDp_eL2;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("dF=%lf, dL=%lf\n",dF,dL);
|
||||
#endif
|
||||
if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
|
||||
tmp=(2.0*dF/dL-1.0);
|
||||
tmp=1.0-tmp*tmp*tmp;
|
||||
mu=mu*((tmp>=CLM_ONE_THIRD)? tmp : CLM_ONE_THIRD);
|
||||
nu=2;
|
||||
|
||||
/* update p's estimate */
|
||||
cbstatus=cublasDcopy(cbhandle, M, pnewd, 1, pd, 1);
|
||||
|
||||
/* update ||e||_2 */
|
||||
p_eL2=pDp_eL2;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
/* if this point is reached, either the linear system could not be solved or
|
||||
* the error did not reduce; in any case, the increment must be rejected
|
||||
*/
|
||||
|
||||
mu*=(double)nu;
|
||||
nu2=nu<<1; // 2*nu;
|
||||
if(nu2<=nu){ /* nu has wrapped around (overflown). */
|
||||
stop=5;
|
||||
break;
|
||||
}
|
||||
|
||||
nu=nu2;
|
||||
|
||||
} /* inner loop */
|
||||
|
||||
}
|
||||
if (randomize) {
|
||||
free(subI);
|
||||
}
|
||||
/**************** end OS loop ***************************/
|
||||
|
||||
}
|
||||
/**** end iteration loop ***********/
|
||||
free(Nos);
|
||||
free(Nbaseos);
|
||||
free(edI);
|
||||
free(NbI);
|
||||
|
||||
if(k>=itmax) stop=3;
|
||||
|
||||
/* copy back current solution */
|
||||
err=cudaMemcpyAsync(p,pd,M*sizeof(double),cudaMemcpyDeviceToHost,0);
|
||||
checkCudaError(err,__FILE__,__LINE__);
|
||||
checkCublasError(cbstatus,__FILE__,__LINE__);
|
||||
|
||||
/* synchronize async operations */
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
if (!gWORK) {
|
||||
cudaFree(xd);
|
||||
cudaFree(jacd);
|
||||
cudaFree(jacTjacd);
|
||||
cudaFree(jacTjacd0);
|
||||
cudaFree(jacTed);
|
||||
cudaFree(Dpd);
|
||||
cudaFree(bd);
|
||||
cudaFree(pd);
|
||||
cudaFree(pnewd);
|
||||
cudaFree(hxd);
|
||||
cudaFree(ed);
|
||||
if (solve_axb==1) {
|
||||
cudaFree(taud);
|
||||
} else if (solve_axb==2) {
|
||||
cudaFree(Ud);
|
||||
cudaFree(VTd);
|
||||
cudaFree(Sd);
|
||||
}
|
||||
cudaFree(cohd);
|
||||
cudaFree(bbd);
|
||||
}
|
||||
|
||||
cudaFree(devInfo);
|
||||
cudaFree(work);
|
||||
if (solve_axb==2) {
|
||||
cudaFree(rwork);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("stop=%d\n",stop);
|
||||
#endif
|
||||
if(info){
|
||||
info[0]=init_p_eL2;
|
||||
info[1]=p_eL2;
|
||||
info[2]=jacTe_inf;
|
||||
info[3]=Dp_L2;
|
||||
info[4]=mu;
|
||||
info[5]=(double)k;
|
||||
info[6]=(double)stop;
|
||||
info[7]=(double)0;
|
||||
info[8]=(double)0;
|
||||
info[9]=(double)0;
|
||||
}
|
||||
return 0;
|
||||
}
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -1,721 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "cuda.h"
|
||||
#include <cuComplex.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* enable this for kernel failure detection */
|
||||
//#define CUDA_DBG
|
||||
|
||||
__global__ void kernel_deriv_robust(int Nbase, int tilesz, int M, int Ns, int Nparam, int goff, double robust_nu, double *x, double *coh, double *p, short *bb, int *ptoclus, double *grad){
 /* Gradient of the robust (Student's t, dof robust_nu) cost w.r.t. one parameter.
    One thread per parameter: thread n handles absolute parameter np=n+goff and
    writes the result to grad[n].

    Data layout:
      x[8*nb .. 8*nb+7]        : residual of baseline nb (re/im of 4 correlations)
      coh[nb*8*M+cm*8 .. +7]   : coherency of cluster cm on baseline nb
      bb[2*nb], bb[2*nb+1]     : (sta1,sta2) of baseline nb; flagged => sta1==sta2==-1
      p                        : per cluster, 8 params per station, Ns stations per chunk
      ptoclus[2*c]             : nchunk (no. of solution chunks) of cluster c
      ptoclus[2*c+1]           : start offset of cluster c's parameters in p
 */
 /* global thread index */
 unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
 /* absolute parameter number this thread differentiates (goff = gradient offset) */
 unsigned int np=n+goff;

 if (n<Nparam) {
   /* accumulated gradient for parameter np */
   double gsum=0.0;

   /* find which cluster np belongs to: cluster cli owns offsets
      ptoclus[2*cli+1] .. ptoclus[2*cli+1]+ptoclus[2*cli]*8*Ns-1 */
   int cli=0;
   while ((cli<M) && (np<ptoclus[2*cli+1] || np>ptoclus[2*cli+1]+ptoclus[2*cli]*8*Ns-1)) { cli++; }
   /* if the scan ran past the end, re-check the last cluster explicitly */
   if ((cli==M) && np>=ptoclus[2*cli-1] && np<=ptoclus[2*cli-1]+ptoclus[2*cli-2]*8*Ns-1) {
    cli--;
   }

   if (cli<M) {
    int pstart=ptoclus[2*cli+1]; /* start of this cluster's parameters in p */
    int nchunk=ptoclus[2*cli];   /* no. of solution chunks for this cluster */

    /* position of np inside the cluster: chunk-local offset, station, chunk */
    unsigned int np_s=(np-pstart)%(8*Ns);
    unsigned int stc=np_s/8; /* this is the station of this param */
    /* which chunk does this parameter belong to */
    unsigned int tpchunk=(np-pstart)/(8*Ns);
    int tilesperchunk=(tilesz+nchunk-1)/nchunk;

    /* total baselines in one tile */
    int Nbase0=(Ns-1)*Ns/2;
    for(unsigned int nb=0; nb<Nbase; nb++) {
      /* which tile is this baseline in, and which chunk does that tile map to */
      int ttile=nb/Nbase0;
      int tptile=ttile/tilesperchunk;
      /* only baselines whose chunk matches this parameter's chunk contribute */
      if (tptile==tpchunk) {
       int sta1=(int)bb[2*nb];
       int sta2=(int)bb[2*nb+1];
       /* contribute only if the baseline involves this parameter's station
          and is not flagged (flagged baselines have sta1==sta2==-1) */
       if (((stc==sta1)||(stc==sta2)) && sta1>=0 && sta2>=0) {
         unsigned int stoff=np_s-stc*8; /* which of the 8 params, 0..7 */
         unsigned int stm=cli;          /* cluster index, for coh lookup */

         /* residual of this baseline, real/imag interleaved */
         double xr[8];
         xr[0]=x[nb*8];
         xr[1]=x[nb*8+1];
         xr[2]=x[nb*8+2];
         xr[3]=x[nb*8+3];
         xr[4]=x[nb*8+4];
         xr[5]=x[nb*8+5];
         xr[6]=x[nb*8+6];
         xr[7]=x[nb*8+7];

         /* 2x2 coherency C of cluster stm on this baseline */
         cuDoubleComplex C[4];
         C[0].x=coh[8*nb*M+8*stm];
         C[0].y=coh[8*nb*M+8*stm+1];
         C[1].x=coh[8*nb*M+8*stm+2];
         C[1].y=coh[8*nb*M+8*stm+3];
         C[2].x=coh[8*nb*M+8*stm+4];
         C[2].y=coh[8*nb*M+8*stm+5];
         C[3].x=coh[8*nb*M+8*stm+6];
         C[3].y=coh[8*nb*M+8*stm+7];

         cuDoubleComplex G1[4];
         cuDoubleComplex G2[4];
         if(stc==sta1) {
           /* differentiate w.r.t. a station-1 parameter:
              G1 = d(J_sta1)/d(p) = unit matrix e_stoff */
           double pp[8];
           pp[0]=0.0;
           pp[1]=0.0;
           pp[2]=0.0;
           pp[3]=0.0;
           pp[4]=0.0;
           pp[5]=0.0;
           pp[6]=0.0;
           pp[7]=0.0;
           pp[stoff]=1.0;
           G1[0].x=pp[0];
           G1[0].y=pp[1];
           G1[1].x=pp[2];
           G1[1].y=pp[3];
           G1[2].x=pp[4];
           G1[2].y=pp[5];
           G1[3].x=pp[6];
           G1[3].y=pp[7];

           /* G2 = J_sta2^H: conjugate and transpose (note swapped [1]/[2]) */
           G2[0].x=p[pstart+tpchunk*8*Ns+sta2*8];
           G2[0].y=-p[pstart+tpchunk*8*Ns+sta2*8+1];
           G2[2].x=p[pstart+tpchunk*8*Ns+sta2*8+2];
           G2[2].y=-p[pstart+tpchunk*8*Ns+sta2*8+3];
           G2[1].x=p[pstart+tpchunk*8*Ns+sta2*8+4];
           G2[1].y=-p[pstart+tpchunk*8*Ns+sta2*8+5];
           G2[3].x=p[pstart+tpchunk*8*Ns+sta2*8+6];
           G2[3].y=-p[pstart+tpchunk*8*Ns+sta2*8+7];
         } else if (stc==sta2) {
           /* differentiate w.r.t. a station-2 parameter:
              G2 = (e_stoff)^H, conjugate and transpose of the unit matrix */
           double pp[8];
           pp[0]=0.0;
           pp[1]=0.0;
           pp[2]=0.0;
           pp[3]=0.0;
           pp[4]=0.0;
           pp[5]=0.0;
           pp[6]=0.0;
           pp[7]=0.0;
           pp[stoff]=1.0;
           G2[0].x=pp[0];
           G2[0].y=-pp[1];
           G2[2].x=pp[2];
           G2[2].y=-pp[3];
           G2[1].x=pp[4];
           G2[1].y=-pp[5];
           G2[3].x=pp[6];
           G2[3].y=-pp[7];

           /* G1 = J_sta1, taken as-is (no conjugation) */
           G1[0].x=p[pstart+tpchunk*8*Ns+sta1*8];
           G1[0].y=p[pstart+tpchunk*8*Ns+sta1*8+1];
           G1[1].x=p[pstart+tpchunk*8*Ns+sta1*8+2];
           G1[1].y=p[pstart+tpchunk*8*Ns+sta1*8+3];
           G1[2].x=p[pstart+tpchunk*8*Ns+sta1*8+4];
           G1[2].y=p[pstart+tpchunk*8*Ns+sta1*8+5];
           G1[3].x=p[pstart+tpchunk*8*Ns+sta1*8+6];
           G1[3].y=p[pstart+tpchunk*8*Ns+sta1*8+7];
         }
         cuDoubleComplex T1[4];
         /* T1=G1*C */
         T1[0]=cuCadd(cuCmul(G1[0],C[0]),cuCmul(G1[1],C[2]));
         T1[1]=cuCadd(cuCmul(G1[0],C[1]),cuCmul(G1[1],C[3]));
         T1[2]=cuCadd(cuCmul(G1[2],C[0]),cuCmul(G1[3],C[2]));
         T1[3]=cuCadd(cuCmul(G1[2],C[1]),cuCmul(G1[3],C[3]));

         cuDoubleComplex T2[4];
         /* T2=T1*G2 , G2 conjugate transposed */
         T2[0]=cuCadd(cuCmul(T1[0],G2[0]),cuCmul(T1[1],G2[2]));
         T2[1]=cuCadd(cuCmul(T1[0],G2[1]),cuCmul(T1[1],G2[3]));
         T2[2]=cuCadd(cuCmul(T1[2],G2[0]),cuCmul(T1[3],G2[2]));
         T2[3]=cuCadd(cuCmul(T1[2],G2[1]),cuCmul(T1[3],G2[3]));

         /* xr . vec(J_p C J_q^H), each term robustly downweighted
            by 1/(nu+residual^2) */
         double dsum;
         dsum=xr[0]*T2[0].x/(robust_nu+xr[0]*xr[0]);
         dsum+=xr[1]*T2[0].y/(robust_nu+xr[1]*xr[1]);
         dsum+=xr[2]*T2[1].x/(robust_nu+xr[2]*xr[2]);
         dsum+=xr[3]*T2[1].y/(robust_nu+xr[3]*xr[3]);
         dsum+=xr[4]*T2[2].x/(robust_nu+xr[4]*xr[4]);
         dsum+=xr[5]*T2[2].y/(robust_nu+xr[5]*xr[5]);
         dsum+=xr[6]*T2[3].x/(robust_nu+xr[6]*xr[6]);
         dsum+=xr[7]*T2[3].y/(robust_nu+xr[7]*xr[7]);
         /* accumulate sum NOTE
            its important to get the sign right,
            depending on res=data-model or res=model-data */
         gsum+=2.0*dsum;
       }
      }
    }
   }

   grad[n]=gsum;
 }
}
|
||||
|
||||
|
||||
__global__ void kernel_func_wt(int Nbase, double *x, double *coh, double *p, short *bb, double *wt, int N){
 /* Weighted model visibilities: one thread per baseline.
    For baseline bl with stations (sta1,sta2), compute the 2x2 product
    T2 = J_sta1 * C * J_sta2^H and store wt .* vec(T2) into x[8*bl .. 8*bl+7].
    Flagged baselines (sta1==sta2==-1) are skipped, leaving x untouched.
    Layouts: p has 8 doubles per station; coh has 8 doubles per baseline;
    wt is a per-element (N=8*Nbase) weight vector. */
 unsigned int bl = threadIdx.x + blockDim.x*blockIdx.x;

 if(bl<Nbase) {
  int sta1=(int)bb[2*bl];
  int sta2=(int)bb[2*bl+1];

  /* only unflagged baselines contribute */
  if (sta1>=0 && sta2>=0) {
   const double *g1=&p[sta1*8];
   const double *g2=&p[sta2*8];
   const double *cc=&coh[8*bl];

   /* J_sta1 as a row-major 2x2 complex matrix */
   cuDoubleComplex G1[4];
   G1[0]=make_cuDoubleComplex(g1[0],g1[1]);
   G1[1]=make_cuDoubleComplex(g1[2],g1[3]);
   G1[2]=make_cuDoubleComplex(g1[4],g1[5]);
   G1[3]=make_cuDoubleComplex(g1[6],g1[7]);

   /* coherency of this baseline */
   cuDoubleComplex C[4];
   C[0]=make_cuDoubleComplex(cc[0],cc[1]);
   C[1]=make_cuDoubleComplex(cc[2],cc[3]);
   C[2]=make_cuDoubleComplex(cc[4],cc[5]);
   C[3]=make_cuDoubleComplex(cc[6],cc[7]);

   /* T1 = G1*C */
   cuDoubleComplex T1[4];
   T1[0]=cuCadd(cuCmul(G1[0],C[0]),cuCmul(G1[1],C[2]));
   T1[1]=cuCadd(cuCmul(G1[0],C[1]),cuCmul(G1[1],C[3]));
   T1[2]=cuCadd(cuCmul(G1[2],C[0]),cuCmul(G1[3],C[2]));
   T1[3]=cuCadd(cuCmul(G1[2],C[1]),cuCmul(G1[3],C[3]));

   /* G2 = J_sta2^H: conjugate each entry and swap the off-diagonals */
   cuDoubleComplex G2[4];
   G2[0]=cuConj(make_cuDoubleComplex(g2[0],g2[1]));
   G2[2]=cuConj(make_cuDoubleComplex(g2[2],g2[3]));
   G2[1]=cuConj(make_cuDoubleComplex(g2[4],g2[5]));
   G2[3]=cuConj(make_cuDoubleComplex(g2[6],g2[7]));

   /* T2 = T1*G2 = J_sta1 * C * J_sta2^H */
   cuDoubleComplex T2[4];
   T2[0]=cuCadd(cuCmul(T1[0],G2[0]),cuCmul(T1[1],G2[2]));
   T2[1]=cuCadd(cuCmul(T1[0],G2[1]),cuCmul(T1[1],G2[3]));
   T2[2]=cuCadd(cuCmul(T1[2],G2[0]),cuCmul(T1[3],G2[2]));
   T2[3]=cuCadd(cuCmul(T1[2],G2[1]),cuCmul(T1[3],G2[3]));

   /* store the weighted model vector */
   double m[8]={T2[0].x,T2[0].y,T2[1].x,T2[1].y,
                T2[2].x,T2[2].y,T2[3].x,T2[3].y};
   for (int k=0; k<8; k++) {
    x[8*bl+k]=wt[8*bl+k]*m[k];
   }
  }
 }
}
|
||||
|
||||
__global__ void kernel_jacf_wt(int Nbase, int M, double *jac, double *coh, double *p, short *bb, double *wt, int N){
 /* Row-weighted Jacobian of the model. 2-D thread grid:
      n (x-dim): baseline index 0..Nbase-1, contributing 8 rows each
      m (y-dim): parameter index 0..M-1 (M = 8 params * Nstations here)
    jac is stored row major: entry (row=8*n+k, col=m) at jac[m+M*(8*n+k)].
    Layouts of coh/p/bb are as in kernel_func_wt; wt holds one weight
    per residual element. */
 /* global thread index : equal to the baseline */
 unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
 /* which parameter:0...M */
 unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;

 if(n<Nbase && m<M) {
  int sta1=(int)bb[2*n];
  int sta2=(int)bb[2*n+1];
  /* skip if the baseline is flagged (sta1==sta2==-1) or parameter m
     belongs to neither station of this baseline; the Jacobian entry is
     zero in those cases (jac was cleared by the caller) */
  int stc=m>>3; /* station owning parameter m (8 params per station) */

  if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {

   /* 2x2 coherency of this baseline */
   cuDoubleComplex C[4];
   C[0].x=coh[8*n];
   C[0].y=coh[8*n+1];
   C[1].x=coh[8*n+2];
   C[1].y=coh[8*n+3];
   C[2].x=coh[8*n+4];
   C[2].y=coh[8*n+5];
   C[3].x=coh[8*n+6];
   C[3].y=coh[8*n+7];

   /* which parameter exactly 0..7 within the station */
   int stoff=m-stc*8;
   double pp1[8];
   double pp2[8];
   /* exactly one branch runs (guard above ensures stc matches a station):
      the differentiated station's Jones matrix is replaced by e_stoff */
   if (stc==sta1) {
    for (int cn=0; cn<8; cn++) {
     pp1[cn]=0.0;
     pp2[cn]=p[sta2*8+cn];
    }
    pp1[stoff]=1.0;
   } else if (stc==sta2) {
    for (int cn=0; cn<8; cn++) {
     pp2[cn]=0.0;
     pp1[cn]=p[sta1*8+cn];
    }
    pp2[stoff]=1.0;
   }


   cuDoubleComplex G1[4];
   G1[0].x=pp1[0];
   G1[0].y=pp1[1];
   G1[1].x=pp1[2];
   G1[1].y=pp1[3];
   G1[2].x=pp1[4];
   G1[2].y=pp1[5];
   G1[3].x=pp1[6];
   G1[3].y=pp1[7];

   cuDoubleComplex T1[4];
   /* T1=G1*C */
   T1[0]=cuCadd(cuCmul(G1[0],C[0]),cuCmul(G1[1],C[2]));
   T1[1]=cuCadd(cuCmul(G1[0],C[1]),cuCmul(G1[1],C[3]));
   T1[2]=cuCadd(cuCmul(G1[2],C[0]),cuCmul(G1[3],C[2]));
   T1[3]=cuCadd(cuCmul(G1[2],C[1]),cuCmul(G1[3],C[3]));

   cuDoubleComplex G2[4];
   /* G2 = pp2^H: conjugate, with off-diagonals swapped (transpose) */
   G2[0].x=pp2[0];
   G2[0].y=-pp2[1];
   G2[2].x=pp2[2];
   G2[2].y=-pp2[3];
   G2[1].x=pp2[4];
   G2[1].y=-pp2[5];
   G2[3].x=pp2[6];
   G2[3].y=-pp2[7];

   cuDoubleComplex T2[4];
   /* T2 = T1*G2 = d(model)/d(param m) for this baseline */
   T2[0]=cuCadd(cuCmul(T1[0],G2[0]),cuCmul(T1[1],G2[2]));
   T2[1]=cuCadd(cuCmul(T1[0],G2[1]),cuCmul(T1[1],G2[3]));
   T2[2]=cuCadd(cuCmul(T1[2],G2[0]),cuCmul(T1[3],G2[2]));
   T2[3]=cuCadd(cuCmul(T1[2],G2[1]),cuCmul(T1[3],G2[3]));
   /* update jacobian , with row weights */
   /* NOTE: row major order */
   jac[m+M*8*n]=wt[8*n]*T2[0].x;
   jac[m+M*(8*n+1)]=wt[8*n+1]*T2[0].y;
   jac[m+M*(8*n+2)]=wt[8*n+2]*T2[1].x;
   jac[m+M*(8*n+3)]=wt[8*n+3]*T2[1].y;
   jac[m+M*(8*n+4)]=wt[8*n+4]*T2[2].x;
   jac[m+M*(8*n+5)]=wt[8*n+5]*T2[2].y;
   jac[m+M*(8*n+6)]=wt[8*n+6]*T2[3].x;
   jac[m+M*(8*n+7)]=wt[8*n+7]*T2[3].y;

  }
 }
}
|
||||
|
||||
__global__ void kernel_setweights(int N, double *wt, double alpha){
 /* fill wt[0..N-1] with the constant alpha, one thread per element */
 unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
 if (idx<N) {
  wt[idx]=alpha;
 }
}
|
||||
|
||||
__global__ void kernel_hadamard(int N, double *wt, double *x){
 /* elementwise (Hadamard) product: x[i] <= x[i]*wt[i] */
 unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
 if (i<N) {
  x[i]=x[i]*wt[i];
 }
}
|
||||
|
||||
__global__ void kernel_updateweights(int N, double *wt, double *x, double *q, double nu){
 /* Student's t weight update: wt[i] = (nu+1)/(nu+x[i]^2), and
    q[i] = wt[i]-log(wt[i]) (always positive), kept for later nu estimation */
 unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
 if (i<N) {
  double r=x[i];
  double w=(nu+1.0)/(nu+r*r);
  wt[i]=w;
  q[i]=w-log(w);
 }
}
|
||||
|
||||
__global__ void kernel_sqrtweights(int N, double *wt){
 /* replace each weight by its square root, wt[i] <= sqrt(wt[i]) */
 unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
 if (i<N) {
  wt[i]=sqrt(wt[i]);
 }
}
|
||||
|
||||
|
||||
__device__ double
digamma(double x) {
  /* Digamma (psi) function for x>0.
     Uses the recurrence psi(x) = psi(x+1) - 1/x to push the argument up to
     x>=7, then evaluates an asymptotic series about the midpoint x-1/2. */
  double result = 0.0, xx, xx2, xx4;
  for ( ; x < 7.0; ++x) { /* recurrence: raise x until x>=7 */
    result -= 1.0/x;
  }
  x -= 1.0/2.0;
  xx = 1.0/x;
  xx2 = xx*xx;
  xx4 = xx2*xx2;
  /* asymptotic series in powers of 1/x^2 */
  result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4;
  return result;
}
|
||||
|
||||
__global__ void kernel_evaluatenu(int Nd, double qsum, double *q, double deltanu,double nulow) {
 /* Grid evaluation of the objective whose root gives the optimal Student's t
    degrees-of-freedom. Thread tid evaluates it at nu = nulow + tid*deltanu
    and stores the value in q[tid]; the caller scans q for the best nu.
    qsum is the averaged (sum(w_i) - sum(ln w_i))/N term from the E step. */
 unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
 if (tid<Nd) {
  double thisnu=(nulow+((double)tid)*deltanu);
  double dgm=digamma(thisnu*0.5+0.5);
  q[tid]=dgm-log((thisnu+1.0)*0.5); /* psi((nu+1)/2)-log((nu+1)/2) */
  dgm=digamma(thisnu*0.5);
  q[tid]+=-dgm+log((thisnu)*0.5); /* -psi((nu)/2)+log((nu)/2) */
  q[tid]+=-qsum+1.0; /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
 }
}
|
||||
|
||||
/* only use extern if calling code is C */
|
||||
extern "C"
|
||||
{
|
||||
|
||||
/* host driver: set all N device weights wt[] to the constant alpha */
void
cudakernel_setweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double alpha) {

#ifdef CUDA_DBG
 cudaError_t err;
#endif
 kernel_setweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, alpha);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 err = cudaGetLastError();
 if (err != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
/* host driver: elementwise product on the device, x <= x .* wt */
void
cudakernel_hadamard(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double *x) {

#ifdef CUDA_DBG
 cudaError_t err;
#endif
 kernel_hadamard<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 err = cudaGetLastError();
 if (err != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
/* host driver: robust weight update wt[i]=(nu+1)/(nu+x[i]^2), q[i]=wt[i]-log(wt[i]) */
void
cudakernel_updateweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt, double *x, double *q, double robust_nu) {

#ifdef CUDA_DBG
 cudaError_t err;
#endif
 kernel_updateweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x, q, robust_nu);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 err = cudaGetLastError();
 if (err != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
/* host driver: take the square root of every weight in place */
void
cudakernel_sqrtweights(int ThreadsPerBlock, int BlocksPerGrid, int N, double *wt) {

#ifdef CUDA_DBG
 cudaError_t err;
#endif
 kernel_sqrtweights<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 err = cudaGetLastError();
 if (err != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
/* host driver: evaluate the optimal-nu objective over a grid of Nd values,
   nu = nulow + tid*deltanu, results stored in q[] */
void
cudakernel_evaluatenu(int ThreadsPerBlock, int BlocksPerGrid, int Nd, double qsum, double *q, double deltanu,double nulow) {

#ifdef CUDA_DBG
 cudaError_t err;
#endif
 kernel_evaluatenu<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu, nulow);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 err = cudaGetLastError();
 if (err != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
/* host driver for kernel_func_wt: compute wt \odot model(p) into x.
   p: params (Mx1), x: output (Nx1); coh, bbh: coherencies and
   baseline->station mapping; Nbase, Mclus, Nstations as elsewhere */
void
cudakernel_func_wt(int ThreadsPerBlock, int BlocksPerGrid, double *p, double *x, int M, int N, double *coh, short *bbh, double *wt, int Nbase, int Mclus, int Nstations) {

#ifdef CUDA_DBG
 cudaError_t err;
#endif
 /* zero x first so entries of flagged baselines stay zero */
 cudaMemset(x, 0, N*sizeof(double));
 kernel_func_wt<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase, x, coh, p, bbh, wt, Nstations);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 err = cudaGetLastError();
 if (err != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(err),__FILE__,__LINE__);
  exit(-1);
 }
#endif
}
|
||||
|
||||
/* cuda driver for calculating wt \odot jacf() */
/* p: params (Mx1), jac: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
void
cudakernel_jacf_wt(int ThreadsPerBlock_row, int ThreadsPerBlock_col, double *p, double *jac, int M, int N, double *coh, short *bbh, double *wt, int Nbase, int Mclus, int Nstations, int clus) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  /* NOTE(review): ThreadsPerBlock_row/ThreadsPerBlock_col (and Mclus, clus)
     are currently unused; the block shape is hard-coded to 16x8 below —
     confirm whether the parameters were meant to drive the launch geometry */
  /* NOTE: use small value for ThreadsPerBlock here, like 8 */
  dim3 threadsPerBlock(16, 8);
  /* jacobian: Nbase x Nstations (proportional to N), so */
  dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x,
     (M+threadsPerBlock.y-1)/threadsPerBlock.y);
  /* set memory of jac to zero: entries not written by the kernel
     (flagged baselines / non-matching parameters) must stay zero */
  cudaMemset(jac, 0, N*M*sizeof(double));
  kernel_jacf_wt<<< numBlocks, threadsPerBlock>>>(Nbase, M, jac, coh, p, bbh, wt, Nstations);

  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif

}
|
||||
|
||||
/* host driver for kernel_deriv_robust (robust-LBFGS gradient).
   ThreadsPerBlock: keep <= 128
   BlocksPerGrid: Threads*Blocks approx no. of parameters
   Nbase: no of baselines (total, including tilesz >1)
   tilesz: tile size
   M: no of clusters
   Ns: no of stations
   Nparam: no of actual parameters <= total
   goff: starting point of gradient calculation 0..Nparams
   x: N*8 x 1 residual
   coh: N*8*M x 1
   p: M*Ns*8 x 1
   bb: 2*N x 1
   ptoclus: 2*M x 1

   grad: Nparams x 1 gradient values (output)
*/
void cudakernel_lbfgs_robust(int ThreadsPerBlock, int BlocksPerGrid, int Nbase, int tilesz, int M, int Ns, int Nparam, int goff, double robust_nu, double *x, double *coh, double *p, short *bb, int *ptoclus, double *grad){

#ifdef CUDA_DBG
 cudaError_t error;
#endif
 /* FIX: dropped the dynamic shared-memory allocation that was passed as the
    third launch argument (ThreadsPerBlock*sizeof(double)); kernel_deriv_robust
    declares no extern __shared__ buffer, so that memory was reserved per block
    but never used */
 kernel_deriv_robust<<< BlocksPerGrid, ThreadsPerBlock >>> (Nbase, tilesz, M, Ns, Nparam, goff, robust_nu, x, coh, p, bb, ptoclus, grad);
 cudaDeviceSynchronize();
#ifdef CUDA_DBG
 error = cudaGetLastError();
 if(error != cudaSuccess) {
  /* print the CUDA error message and exit */
  fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
  exit(-1);
 }
#endif

}
|
||||
|
||||
}
|
Binary file not shown.
|
@ -1,536 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "cuda.h"
|
||||
#include <cuComplex.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* enable this for checking for kernel failure */
|
||||
//#define CUDA_DBG
|
||||
|
||||
/* Weighted model visibility kernel (single precision).
   For each unflagged baseline n, forms V = G1 * C * G2^H where
   G1,G2 are the 2x2 Jones matrices of the two stations and C is the
   2x2 coherency, then writes the 8 weighted real/imag components to
   x[8n..8n+7]. One thread per baseline. */
__global__ void
kernel_func_wt_fl(int Nbase, float *x, float *coh, float *p, short *bb, float *wt, int N){
  /* global thread index : equal to the baseline */
  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;

  /* this thread works on
    x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
    bb[2*n:2*n+1] (sta1,sta2)
    organization of p (N stations and M clusters)
             sta 0       sta 1    sta 2 ....  sta N-1
    clus 0   0...7       8...15   16...23 ... 8N-8     8N-1
    clus 1   8N..8N+7    8N+8..8N+15  8N+16..8N+23 .... 8N+8N-8...8N+8N-1
    ......
    clus M-1 (M-1)N..(M-1)N+7  (M-1)N+8..(M-1)N+15....  ...(M-1)N+8N-8 (M-1)N+8N-1

    organization of coherencies (coh)
    [0, 8*M-1] : baseline 0
    [8*M, 8*M+8*M-1]: baseline 1
    [n*8*M, n*8*M+8*M-1]: baseline n
    ......
    [n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n

    residual error stored at sum[n]
  */

  if(n<Nbase) {
    /* station indices of this baseline; negative means flagged */
    int sta1=(int)bb[2*n];
    int sta2=(int)bb[2*n+1];

    /* condition for calculating this baseline sum is
       1) its not flagged (sta1,sta2)>=0
    */
    if (sta1>=0 && sta2>=0) {
      /* G1: Jones matrix of station sta1, 8 packed floats -> 4 complex */
      cuComplex G1[4];
      float pp[8];
      pp[0]=p[sta1*8];
      pp[1]=p[sta1*8+1];
      pp[2]=p[sta1*8+2];
      pp[3]=p[sta1*8+3];
      pp[4]=p[sta1*8+4];
      pp[5]=p[sta1*8+5];
      pp[6]=p[sta1*8+6];
      pp[7]=p[sta1*8+7];
      G1[0].x=pp[0];
      G1[0].y=pp[1];
      G1[1].x=pp[2];
      G1[1].y=pp[3];
      G1[2].x=pp[4];
      G1[2].y=pp[5];
      G1[3].x=pp[6];
      G1[3].y=pp[7];


      /* C: 2x2 coherency of this baseline */
      cuComplex C[4];
      C[0].x=coh[8*n];
      C[0].y=coh[8*n+1];
      C[1].x=coh[8*n+2];
      C[1].y=coh[8*n+3];
      C[2].x=coh[8*n+4];
      C[2].y=coh[8*n+5];
      C[3].x=coh[8*n+6];
      C[3].y=coh[8*n+7];

      cuComplex T1[4];
      /* T=G1*C */
      T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
      T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
      T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
      T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));

      /* G2 = (Jones of sta2)^H : negate imaginary parts (conjugate) and
         swap off-diagonal elements (transpose, note indices 1<->2) */
      cuComplex G2[4];
      /* conjugate this */
      pp[0]=p[sta2*8];
      pp[1]=-p[sta2*8+1];
      pp[2]=p[sta2*8+2];
      pp[3]=-p[sta2*8+3];
      pp[4]=p[sta2*8+4];
      pp[5]=-p[sta2*8+5];
      pp[6]=p[sta2*8+6];
      pp[7]=-p[sta2*8+7];
      G2[0].x=pp[0];
      G2[0].y=pp[1];
      G2[2].x=pp[2];
      G2[2].y=pp[3];
      G2[1].x=pp[4];
      G2[1].y=pp[5];
      G2[3].x=pp[6];
      G2[3].y=pp[7];

      /* T2 = T1 * G2 = G1 * C * G2^H */
      cuComplex T2[4];
      T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
      T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
      T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
      T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
      /* update model vector, with weights (per-component weights wt) */
      x[8*n]=wt[8*n]*T2[0].x;
      x[8*n+1]=wt[8*n+1]*T2[0].y;
      x[8*n+2]=wt[8*n+2]*T2[1].x;
      x[8*n+3]=wt[8*n+3]*T2[1].y;
      x[8*n+4]=wt[8*n+4]*T2[2].x;
      x[8*n+5]=wt[8*n+5]*T2[2].y;
      x[8*n+6]=wt[8*n+6]*T2[3].x;
      x[8*n+7]=wt[8*n+7]*T2[3].y;

    }
  }

}
|
||||
|
||||
/* Weighted Jacobian kernel (single precision).
   Thread (n,m) computes the derivative of the 8 components of baseline n
   with respect to parameter m by substituting a unit vector for the
   parameter's Jones entry, then scales rows by wt and stores into jac
   (row major: jac[m + M*row]). */
__global__ void
kernel_jacf_wt_fl(int Nbase, int M, float *jac, float *coh, float *p, short *bb, float *wt, int N){
  /* global thread index : equal to the baseline */
  unsigned int n = threadIdx.x + blockDim.x*blockIdx.x;
  /* which parameter:0...M */
  unsigned int m = threadIdx.y + blockDim.y*blockIdx.y;

  /* this thread works on
    x[8*n:8*n+7], coh[8*M*n:8*M*n+8*M-1]
    bb[2*n:2*n+1] (sta1,sta2)
    organization of p (N stations and M clusters)
             sta 0       sta 1    sta 2 ....  sta N-1
    clus 0   0...7       8...15   16...23 ... 8N-8     8N-1
    clus 1   8N..8N+7    8N+8..8N+15  8N+16..8N+23 .... 8N+8N-8...8N+8N-1
    ......
    clus M-1 (M-1)N..(M-1)N+7  (M-1)N+8..(M-1)N+15....  ...(M-1)N+8N-8 (M-1)N+8N-1

    organization of coherencies (coh)
    [0, 8*M-1] : baseline 0
    [8*M, 8*M+8*M-1]: baseline 1
    [n*8*M, n*8*M+8*M-1]: baseline n
    ......
    [n*8*M+cm*8, n*8*M+cm*8+7] cluster cm, baseline n

    residual error stored at sum[n]
  */

  if(n<Nbase && m<M) {
    int sta1=(int)bb[2*n];
    int sta2=(int)bb[2*n+1];
    /* condition for calculating this baseline sum is
      If this baseline is flagged,
      or if this parameter does not belong to sta1 or sta2
      we do not compute
    */
    //int stc=m/8; /* 0...Ns-1 (because M=total par= 8 * Nstations */
    int stc=m>>3; /* 0...Ns-1 (because M=total par= 8 * Nstations */

    /* NOTE(review): if sta1==sta2==stc (autocorrelation) only the sta1
       branch below is taken; assumes baselines exclude autocorrelations
       -- confirm against the caller's baseline list */
    if (((stc==sta2)||(stc==sta1)) && sta1>=0 && sta2>=0 ) {

      cuComplex C[4];
      C[0].x=coh[8*n];
      C[0].y=coh[8*n+1];
      C[1].x=coh[8*n+2];
      C[1].y=coh[8*n+3];
      C[2].x=coh[8*n+4];
      C[2].y=coh[8*n+5];
      C[3].x=coh[8*n+6];
      C[3].y=coh[8*n+7];

      /* which parameter exactly 0..7 */
      //int stoff=m%8;
      int stoff=m-stc*8;
      /* pp1/pp2: parameter vectors of sta1/sta2; the station owning
         parameter m gets a unit vector (derivative), the other keeps
         its current values */
      float pp1[8];
      float pp2[8];
      if (stc==sta1) {
        for (int cn=0; cn<8; cn++) {
          pp1[cn]=0.0f;
          pp2[cn]=p[sta2*8+cn];
        }
        pp1[stoff]=1.0f;
      } else if (stc==sta2) {
        for (int cn=0; cn<8; cn++) {
          pp2[cn]=0.0f;
          pp1[cn]=p[sta1*8+cn];
        }
        pp2[stoff]=1.0f;
      }


      cuComplex G1[4];
      G1[0].x=pp1[0];
      G1[0].y=pp1[1];
      G1[1].x=pp1[2];
      G1[1].y=pp1[3];
      G1[2].x=pp1[4];
      G1[2].y=pp1[5];
      G1[3].x=pp1[6];
      G1[3].y=pp1[7];

      cuComplex T1[4];
      /* T=G1*C */
      T1[0]=cuCaddf(cuCmulf(G1[0],C[0]),cuCmulf(G1[1],C[2]));
      T1[1]=cuCaddf(cuCmulf(G1[0],C[1]),cuCmulf(G1[1],C[3]));
      T1[2]=cuCaddf(cuCmulf(G1[2],C[0]),cuCmulf(G1[3],C[2]));
      T1[3]=cuCaddf(cuCmulf(G1[2],C[1]),cuCmulf(G1[3],C[3]));

      /* G2 = pp2^H : conjugate (negated imaginary parts) and transpose
         (note off-diagonal indices swapped) */
      cuComplex G2[4];
      /* conjugate this */
      G2[0].x=pp2[0];
      G2[0].y=-pp2[1];
      G2[2].x=pp2[2];
      G2[2].y=-pp2[3];
      G2[1].x=pp2[4];
      G2[1].y=-pp2[5];
      G2[3].x=pp2[6];
      G2[3].y=-pp2[7];

      /* T2 = T1 * G2 : derivative of G1*C*G2^H w.r.t. parameter m */
      cuComplex T2[4];
      T2[0]=cuCaddf(cuCmulf(T1[0],G2[0]),cuCmulf(T1[1],G2[2]));
      T2[1]=cuCaddf(cuCmulf(T1[0],G2[1]),cuCmulf(T1[1],G2[3]));
      T2[2]=cuCaddf(cuCmulf(T1[2],G2[0]),cuCmulf(T1[3],G2[2]));
      T2[3]=cuCaddf(cuCmulf(T1[2],G2[1]),cuCmulf(T1[3],G2[3]));
      /* update jacobian , with row weights */
      /* NOTE: row major order */
      jac[m+M*8*n]=wt[8*n]*T2[0].x;
      jac[m+M*(8*n+1)]=wt[8*n+1]*T2[0].y;
      jac[m+M*(8*n+2)]=wt[8*n+2]*T2[1].x;
      jac[m+M*(8*n+3)]=wt[8*n+3]*T2[1].y;
      jac[m+M*(8*n+4)]=wt[8*n+4]*T2[2].x;
      jac[m+M*(8*n+5)]=wt[8*n+5]*T2[2].y;
      jac[m+M*(8*n+6)]=wt[8*n+6]*T2[3].x;
      jac[m+M*(8*n+7)]=wt[8*n+7]*T2[3].y;

    }
  }

}
|
||||
|
||||
/* Fill the N-element weight vector wt with the constant alpha.
   One thread per element; surplus threads exit immediately. */
__global__ void
kernel_setweights_fl(int N, float *wt, float alpha){
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  /* guard: only the first N threads write */
  if (idx >= (unsigned int)N) {
    return;
  }
  wt[idx] = alpha;
}
|
||||
|
||||
/* Element-wise (Hadamard) product: x[i] <- x[i] * wt[i] for i in [0,N).
   One thread per element; surplus threads exit immediately. */
__global__ void
kernel_hadamard_fl(int N, float *wt, float *x){
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= (unsigned int)N) {
    return;
  }
  x[idx] = x[idx]*wt[idx];
}
|
||||
|
||||
/* Robust (Student's t) weight update:
     wt[i] = (nu+1)/(nu + x[i]^2)
     q[i]  = wt[i] - log(wt[i])   (w - log w >= 1, so q stays positive)
   One thread per element. */
__global__ void
kernel_updateweights_fl(int N, float *wt, float *x, float *q, float nu){
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx < (unsigned int)N) {
    float r = x[idx];
    float w = (nu+1.0f)/(nu + r*r);
    wt[idx] = w;
    q[idx] = w - logf(w); /* so that its +ve */
  }
}
|
||||
|
||||
/* Replace each weight by its square root: wt[i] <- sqrt(wt[i]).
   One thread per element. */
__global__ void
kernel_sqrtweights_fl(int N, float *wt){
  unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= (unsigned int)N) {
    return;
  }
  wt[idx] = sqrtf(wt[idx]);
}
|
||||
|
||||
|
||||
/* Single-precision approximation of the digamma (psi) function.
   Uses the recurrence psi(x) = psi(x+1) - 1/x to push the argument up,
   then an asymptotic series evaluated at the shifted point x - 1/2.
   NOTE(review): assumes x > 0 on entry (callers pass nu-related values);
   x <= 0 would hit a division by zero in the reduction loop -- confirm. */
__device__ float
digamma_fl(float x) {
  float result = 0.0f, xx, xx2, xx4;
  for ( ; x < 7.0f; ++x) { /* reduce x until x >= 7 */
    result -= 1.0f/x;
  }
  /* evaluate the series about the midpoint x - 1/2 */
  x -= 1.0f/2.0f;
  xx = 1.0f/x;
  xx2 = xx*xx;
  xx4 = xx2*xx2;
  result += logf(x)+(1.0f/24.0f)*xx2-(7.0f/960.0f)*xx4+(31.0f/8064.0f)*xx4*xx2-(127.0f/30720.0f)*xx4*xx4;
  return result;
}
|
||||
|
||||
/* Evaluate the nu-estimation objective on a grid of Nd candidate values
   nu = nulow + tid*deltanu; result written to q[tid]. qsum is the
   precomputed data term shared by all candidates. */
__global__ void
kernel_evaluatenu_fl(int Nd, float qsum, float *q, float deltanu,float nulow) {
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  if (tid<Nd) {
    /* candidate nu for this thread */
    float thisnu=(nulow+((float)tid)*deltanu);
    float dgm=digamma_fl(thisnu*0.5f+0.5f);
    q[tid]=dgm-logf((thisnu+1.0f)*0.5f); /* psi((nu+1)/2)-log((nu+1)/2) */
    dgm=digamma_fl(thisnu*0.5f);
    q[tid]+=-dgm+logf((thisnu)*0.5f); /* -psi((nu)/2)+log((nu)/2) */
    q[tid]+=-qsum+1.0f; /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
  }
}
|
||||
|
||||
/* Evaluate the AECM nu-estimation objective on a grid of Nd candidate
   values nu = nulow + tid*deltanu; result written to q[tid].
   qsum: precomputed data term; nu0: current value of robust_nu.
   All candidates share the constant term psi((nu0+2)/2)-log((nu0+2)/2). */
__global__ void
kernel_evaluatenu_fl_eight(int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {
  unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
  /* each block calculate psi((nu+8)/2)-log((nu+8)/2) */
  /* actually p=2, so psi((nu+2)/2)-log((nu+2)/2) */
  /* BUG FIX: dgm0 was an automatic (per-thread register) variable written
     only by threadIdx.x==0; __syncthreads() does not propagate register
     values, so every other thread read uninitialized data. It must live
     in shared memory so the whole block sees thread 0's result. */
  __shared__ float dgm0;
  if (threadIdx.x==0) {
    float d=digamma_fl(nu0*0.5f+1.0f);
    dgm0=d-logf((nu0+2.0f)*0.5f); /* psi((nu0+2)/2)-log((nu0+2)/2) */
  }
  __syncthreads();
  if (tid<Nd) {
    float thisnu=(nulow+((float)tid)*deltanu);
    q[tid]=dgm0; /* psi((nu0+2)/2)-log((nu0+2)/2) */
    float dgm=digamma_fl(thisnu*0.5f);
    q[tid]+=-dgm+logf((thisnu)*0.5f); /* -psi((nu)/2)+log((nu)/2) */
    q[tid]+=-qsum+1.0f; /* -(-sum(ln(w_i))/N+sum(w_i)/N)+1 */
  }
}
|
||||
|
||||
/* only use extern if calling code is C */
|
||||
extern "C"
|
||||
{
|
||||
|
||||
/* set initial weights to 1 by a cuda kernel */
|
||||
/* Host wrapper: launch kernel_setweights_fl to set all N weights to alpha,
   then synchronize. Error checking only when compiled with CUDA_DBG. */
void
cudakernel_setweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float alpha) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  kernel_setweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, alpha);
  /* block until the kernel has completed */
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif


}
|
||||
|
||||
/* hadamard product by a cuda kernel x<= x*wt */
|
||||
/* Host wrapper: launch kernel_hadamard_fl to compute x <- x .* wt
   element-wise over N values, then synchronize. */
void
cudakernel_hadamard_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float *x) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  kernel_hadamard_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x);
  /* block until the kernel has completed */
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif


}
|
||||
|
||||
/* update weights by a cuda kernel */
|
||||
/* Host wrapper: launch kernel_updateweights_fl to recompute the N robust
   weights wt (and auxiliary q) from residuals x and robust_nu. */
void
cudakernel_updateweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt, float *x, float *q, float robust_nu) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  kernel_updateweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt, x, q, robust_nu);
  /* block until the kernel has completed */
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif


}
|
||||
|
||||
/* update weights by a cuda kernel */
|
||||
/* Host wrapper: launch kernel_sqrtweights_fl to replace each of the N
   weights by its square root, then synchronize. */
void
cudakernel_sqrtweights_fl(int ThreadsPerBlock, int BlocksPerGrid, int N, float *wt) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  kernel_sqrtweights_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(N, wt);
  /* block until the kernel has completed */
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif


}
|
||||
|
||||
/* evaluate expression for finding optimum nu for
|
||||
a range of nu values */
|
||||
/* Host wrapper: launch kernel_evaluatenu_fl to evaluate the nu objective
   at Nd grid points nulow + i*deltanu, storing results in q. */
void
cudakernel_evaluatenu_fl(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  kernel_evaluatenu_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu,nulow);
  /* block until the kernel has completed */
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif


}
|
||||
|
||||
|
||||
/* evaluate expression for finding optimum nu for
|
||||
a range of nu values, using AECM (p=8 before, but now p=2)
|
||||
nu0: current value of robust_nu*/
|
||||
/* Host wrapper: launch kernel_evaluatenu_fl_eight (AECM variant, p=2)
   to evaluate the nu objective at Nd grid points; nu0 is the current
   robust_nu value. */
void
cudakernel_evaluatenu_fl_eight(int ThreadsPerBlock, int BlocksPerGrid, int Nd, float qsum, float *q, float deltanu,float nulow, float nu0) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  kernel_evaluatenu_fl_eight<<< BlocksPerGrid, ThreadsPerBlock >>>(Nd, qsum, q, deltanu,nulow, nu0);
  /* block until the kernel has completed */
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif


}
|
||||
|
||||
/* cuda driver for calculating wt \odot f() */
|
||||
/* p: params (Mx1), x: data (Nx1), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
|
||||
/* Host wrapper for the weighted model function: zeroes the N-float output
   x, then launches kernel_func_wt_fl over Nbase baselines.
   p: params (Mx1), x: data (Nx1); coh/bbh give per-baseline coherencies
   and station mapping; Mclus and Nstations sizes per the caller. */
void
cudakernel_func_wt_fl(int ThreadsPerBlock, int BlocksPerGrid, float *p, float *x, int M, int N, float *coh, short *bbh, float *wt, int Nbase, int Mclus, int Nstations) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  /* clear output before the kernel accumulates into it */
  cudaMemset(x, 0, N*sizeof(float));
//  printf("Kernel data size=%d, block=%d, thread=%d, baselines=%d\n",N,BlocksPerGrid, ThreadsPerBlock,Nbase);
  kernel_func_wt_fl<<< BlocksPerGrid, ThreadsPerBlock >>>(Nbase, x, coh, p, bbh, wt, Nstations);
  cudaDeviceSynchronize();
#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif

}
|
||||
|
||||
/* cuda driver for calculating wt \odot jacf() */
|
||||
/* p: params (Mx1), jac: jacobian (NxM), other data : coh, baseline->stat mapping, Nbase, Mclusters, Nstations */
|
||||
/* Host wrapper for the weighted Jacobian: zeroes the NxM output jac,
   then launches kernel_jacf_wt_fl on a 2-D grid (baselines x parameters).
   NOTE(review): the ThreadsPerBlock_row/_col parameters are ignored;
   a fixed 16x8 block is used instead -- presumably intentional, confirm. */
void
cudakernel_jacf_wt_fl(int ThreadsPerBlock_row, int ThreadsPerBlock_col, float *p, float *jac, int M, int N, float *coh, short *bbh, float *wt, int Nbase, int Mclus, int Nstations, int clus) {

#ifdef CUDA_DBG
  cudaError_t error;
#endif
  /* NOTE: use small value for ThreadsPerBlock here, like 8 */
  dim3 threadsPerBlock(16, 8);
  /* jacobian: Nbase x Nstations (proportional to N), so */
  dim3 numBlocks((Nbase+threadsPerBlock.x-1)/threadsPerBlock.x,
                 (M+threadsPerBlock.y-1)/threadsPerBlock.y);
  /* set memory of jac to zero */
  cudaMemset(jac, 0, N*M*sizeof(float));
 // printf("Kernel Jax data size=%d, params=%d, block=%d,%d, thread=%d,%d, baselines=%d\n",N, M, numBlocks.x,numBlocks.y, threadsPerBlock.x, threadsPerBlock.y, Nbase);
  kernel_jacf_wt_fl<<< numBlocks, threadsPerBlock>>>(Nbase, M, jac, coh, p, bbh, wt, Nstations);

  cudaDeviceSynchronize();

#ifdef CUDA_DBG
  error = cudaGetLastError();
  if(error != cudaSuccess)
  {
    // print the CUDA error message and exit
    fprintf(stderr,"CUDA error: %s :%s: %d\n", cudaGetErrorString(error),__FILE__,__LINE__);
    exit(-1);
  }
#endif

}
|
||||
|
||||
}
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -1,894 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#include "Solvers.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
//#define DEBUG
|
||||
/* helper functions for diagnostics */
|
||||
/* Diagnostic helper: if err is not cudaSuccess, print the CUDA error with
   the source location and abort. Compiled to a no-op unless CUDA_DEBUG is
   defined.
   FIX: file is now const char * -- callers pass the __FILE__ string
   literal, and binding a string literal to char* is ill-formed in
   modern C++ (CUDA sources compile as C++). */
static void
checkCudaError(cudaError_t err, const char *file, int line)
{
#ifdef CUDA_DEBUG
  if(err==cudaSuccess)
    return;
  fprintf(stderr,"GPU (CUDA): %s %s %d\n", cudaGetErrorString(err),file,line);
  exit(EXIT_FAILURE);
#endif
}
|
||||
|
||||
|
||||
/* Diagnostic helper: if the cuBLAS status is not success, print the source
   location and abort. Compiled to a no-op unless CUDA_DEBUG is defined.
   FIX: file is now const char * (callers pass the __FILE__ string literal;
   char* from a literal is ill-formed in modern C++). */
static void
checkCublasError(cublasStatus_t cbstatus, const char *file, int line)
{
#ifdef CUDA_DEBUG
  if (cbstatus!=CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr,"%s: %d: CUBLAS failure\n",file,line);
    exit(EXIT_FAILURE);
  }
#endif
}
|
||||
|
||||
|
||||
/* Retraction
|
||||
rnew: new value */
|
||||
/* rnew = x + r */
|
||||
/* Retraction on the manifold: rnew = x + r, computed with cuBLAS on
   4N-element complex vectors (x, r, rnew are 2N x 2 matrices stored
   column major). solver_handle is unused here but kept for a uniform
   signature with the other fns_* routines. */
void
cudakernel_fns_R(int N, cuFloatComplex *x, cuFloatComplex *r, cuFloatComplex *rnew, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  cublasStatus_t cbstatus;
  /* rnew <- x */
  cbstatus=cublasCcopy(cbhandle,4*N,x,1,rnew,1);
  cuFloatComplex alpha;
  alpha.x=1.0f; alpha.y=0.0f;
  /* rnew <- rnew + 1*r */
  cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, r, 1, rnew, 1);
  /* note: only the last cuBLAS status is checked */
  checkCublasError(cbstatus,__FILE__,__LINE__);
}
|
||||
|
||||
|
||||
/* inner product (metric) */
|
||||
/* Riemannian inner product (metric) of eta and gamma at point x:
   returns 2*real(trace(eta^H gamma)). x and solver_handle are unused
   here; kept for a uniform signature. */
float
cudakernel_fns_g(int N,cuFloatComplex *x,cuFloatComplex *eta, cuFloatComplex *gamma,cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  /* 2 x real( trace(eta'*gamma) )
   = 2 x real( eta(:,1)'*gamma(:,1) + eta(:,2)'*gamma(:,2) )
   no need to calculate off diagonal terms
  )*/
  cublasStatus_t cbstatus;
  cuFloatComplex r1,r2;
  /* r1 = eta(:,1)' * gamma(:,1) */
  //complex double v1=my_cdot(2*N,eta,gamma);
  cbstatus=cublasCdotc(cbhandle,2*N,eta,1,gamma,1,&r1);
  /* r2 = eta(:,2)' * gamma(:,2) */
  //complex double v2=my_cdot(2*N,&eta[2*N],&gamma[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,&eta[2*N],1,&gamma[2*N],1,&r2);

  /* note: only the last cuBLAS status is checked */
  checkCublasError(cbstatus,__FILE__,__LINE__);
  return 2.0f*(r1.x+r2.x);
}
|
||||
|
||||
|
||||
/* Projection
|
||||
rnew: new value */
|
||||
/* Project z onto the tangent space at x (2N x 2 complex matrices):
   rnew = Z - X*Om, where the 2x2 matrix Om solves the Sylvester-type
   equation Om X^H X + X^H X Om = X^H Z - Z^H X. The 4x4 linear system
   for vec(Om) is solved on the GPU via cuSOLVER QR (geqrf/unmqr) plus a
   cuBLAS triangular solve. */
void
cudakernel_fns_proj(int N, cuFloatComplex *x, cuFloatComplex *z, cuFloatComplex *rnew, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  /* projection = Z-X Om, where
    Om X^H X+X^H X Om = X^H Z - Z^H X
   is solved to find Om */

  cublasStatus_t cbstatus;
  /* find X^H X (2x2 Hermitian: only 3 dot products needed) */
  cuFloatComplex xx00,xx01,xx10,xx11,*bd;
  //xx00=my_cdot(2*N,x,x);
  cbstatus=cublasCdotc(cbhandle,2*N,x,1,x,1,&xx00);
  //xx01=my_cdot(2*N,x,&x[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,x,1,&x[2*N],1,&xx01);
  xx10=cuConjf(xx01);
  //xx11=my_cdot(2*N,&x[2*N],&x[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,&x[2*N],1,&xx11);

  /* find X^H Z (and using this just calculte Z^H X directly) */
  cuFloatComplex xz00,xz01,xz10,xz11;
  //xz00=my_cdot(2*N,x,z);
  cbstatus=cublasCdotc(cbhandle,2*N,x,1,z,1,&xz00);
  //xz01=my_cdot(2*N,x,&z[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,x,1,&z[2*N],1,&xz01);
  //xz10=my_cdot(2*N,&x[2*N],z);
  cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,z,1,&xz10);
  //xz11=my_cdot(2*N,&x[2*N],&z[2*N]);
  cbstatus=cublasCdotc(cbhandle,2*N,&x[2*N],1,&z[2*N],1,&xz11);

  /* find X^H Z - Z^H X (skew-Hermitian right-hand side) */
  cuFloatComplex rr00,rr01,rr10,rr11;
  //rr00=xz00-conj(xz00);
  rr00=cuCsubf(xz00,cuConjf(xz00));
  //rr01=xz01-conj(xz10);
  rr01=cuCsubf(xz01,cuConjf(xz10));
  //rr10=-conj(rr01);  (i.e. (-re, +im))
  rr10.x=-rr01.x; rr10.y=rr01.y;
  //rr11=xz11-conj(xz11);
  rr11=cuCsubf(xz11,cuConjf(xz11));

  /* find I_2 kron (X^H X) + (X^H X)^T kron I_2 */
  /* A = [2*xx00  xx01       xx10       0
          xx10    xx11+xx00  0          xx10
          xx01    0          xx11+xx00  xx01
          0       xx01       xx10       2*xx11 ]
  */
  cuFloatComplex A[16],*Ad;
  A[0]=cuCmulf(make_cuFloatComplex(2.0f,0.0f),xx00);
  A[5]=A[10]=cuCaddf(xx00,xx11);
  A[15]=cuCmulf(make_cuFloatComplex(2.0f,0.0f),xx11);
  A[1]=A[8]=A[11]=A[13]=xx10;
  A[2]=A[4]=A[7]=A[14]=xx01;
  A[3]=A[6]=A[9]=A[12]=make_cuFloatComplex(0.0f,0.0f);
  /* right-hand side vec(X^H Z - Z^H X), column major */
  cuFloatComplex b[4];
  b[0]=rr00;
  b[1]=rr10;
  b[2]=rr01;
  b[3]=rr11;

#ifdef DEBUG
  printf("BEFOREA=[\n");
  printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[0].x,A[0].y,A[4].x,A[4].y,A[8].x,A[8].y,A[12].x,A[12].y);
  printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[1].x,A[1].y,A[5].x,A[5].y,A[9].x,A[9].y,A[13].x,A[13].y);
  printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[2].x,A[2].y,A[6].x,A[6].y,A[10].x,A[10].y,A[14].x,A[14].y);
  printf("%f+j*(%f) %f+j*(%f) %f+j*(%f) %f+j*(%f)\n",A[3].x,A[3].y,A[7].x,A[7].y,A[11].x,A[11].y,A[15].x,A[15].y);
  printf("];\n");
  printf("BEFOREb=[\n");
  printf("%f+j*(%f)\n",b[0].x,b[0].y);
  printf("%f+j*(%f)\n",b[1].x,b[1].y);
  printf("%f+j*(%f)\n",b[2].x,b[2].y);
  printf("%f+j*(%f)\n",b[3].x,b[3].y);
  printf("];\n");
#endif


  /* solve A u = b to find u: copy the 4x4 system to the device */
  cudaMalloc((void **)&Ad, 16*sizeof(cuFloatComplex));
  cudaMemcpy(Ad,A,16*sizeof(cuFloatComplex),cudaMemcpyHostToDevice);
  /* copy b to device */
  cudaMalloc((void **)&bd, 4*sizeof(cuFloatComplex));
  cudaMemcpy(bd,b,4*sizeof(cuFloatComplex),cudaMemcpyHostToDevice);

  //culaStatus status;
  //status=culaDeviceCgels('N',4,4,1,(culaDeviceFloatComplex *)Ad,4,(culaDeviceFloatComplex *)bd,4);
  //checkStatus(status,__FILE__,__LINE__);
  /* QR-based solve: A = QR, then bd <- Q^H bd, then back-substitute R */
  int work_size=0;
  int *devInfo;
  cudaError_t err;
  err=cudaMalloc((void**)&devInfo, sizeof(int));
  checkCudaError(err,__FILE__,__LINE__);
  cuFloatComplex *work,*taud;
  cusolverDnCgeqrf_bufferSize(solver_handle, 4, 4, (cuFloatComplex *)Ad, 4, &work_size);
  err=cudaMalloc((void**)&work, work_size*sizeof(cuFloatComplex));
  err=cudaMalloc((void**)&taud, 4*sizeof(cuFloatComplex));
  checkCudaError(err,__FILE__,__LINE__);
  cusolverDnCgeqrf(solver_handle, 4, 4, Ad, 4, taud, work, work_size, devInfo);
  cusolverDnCunmqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_C, 4, 1, 4, Ad, 4, taud, bd, 4, work, work_size, devInfo);
  cuFloatComplex cone; cone.x=1.0f; cone.y=0.0f;
  cbstatus=cublasCtrsm(cbhandle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,4,1,&cone,Ad,4,bd,4);


  cudaFree(work);
  cudaFree(taud);
  cudaFree(devInfo);


#ifdef DEBUG
  cudaMemcpy(b,bd,4*sizeof(cuFloatComplex),cudaMemcpyDeviceToHost);
  printf("Afterb=[\n");
  printf("%f+j*(%f)\n",b[0].x,b[0].y);
  printf("%f+j*(%f)\n",b[1].x,b[1].y);
  printf("%f+j*(%f)\n",b[2].x,b[2].y);
  printf("%f+j*(%f)\n",b[3].x,b[3].y);
  printf("];\n");
#endif

  /* form Z - X * Om, where Om is given by solution b
    but no need to rearrange b because it is already in col major order */
  //my_ccopy(4*N,z,1,rnew,1);
  cbstatus=cublasCcopy(cbhandle,4*N,z,1,rnew,1);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  //my_zgemm('N','N',2*N,2,2,-1.0+0.0*_Complex_I,z,2*N,b,2,1.0+0.0*_Complex_I,rnew,2*N);
  cuFloatComplex a1,a2;
  a1.x=-1.0f; a1.y=0.0f;
  a2.x=1.0f; a2.y=0.0f;
#ifdef DEBUG
  /* read back eta for checking */
  cuFloatComplex *etalocal;
  cudaHostAlloc((void **)&etalocal, sizeof(cuFloatComplex)*4*N,cudaHostAllocDefault);
  cudaMemcpy(etalocal, rnew, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
  printf("Rnewbefore=[\n");
  int ci;
  for (ci=0; ci<2*N; ci++) {
   printf("%f+j*(%f) %f+j*(%f);\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
  }
  printf("]\n");
#endif

  /* rnew <- rnew - X * Om  (Om lives on the device in bd) */
  cbstatus=cublasCgemm(cbhandle,CUBLAS_OP_N,CUBLAS_OP_N,2*N,2,2,&a1,x,2*N,bd,2,&a2,rnew,2*N);

#ifdef DEBUG
  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaMemcpy(etalocal, rnew, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
  printf("Rnewafter=[\n");
  for (ci=0; ci<2*N; ci++) {
   printf("%f+j*(%f) %f+j*(%f);\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
  }
  printf("]\n");
  cudaFreeHost(etalocal);
#endif
  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaFree(Ad);
  cudaFree(bd);
}
|
||||
|
||||
/* gradient, also projected to tangent space */
|
||||
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
|
||||
/* Euclidean gradient of the cost at x, accumulated over M baselines
   (chunked if M is large), scaled by iw, optionally negated, and finally
   projected onto the tangent space (result in eta). */
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
static void
cudakernel_fns_fgrad(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, float *y, float *coh, short *bbh, float *iw, int negate, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  cuFloatComplex *tempeta,*tempb;
  cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
  cuFloatComplex alpha;
  /* NOTE(review): cudaMalloc return values are not checked here */
  cudaMalloc((void**)&tempeta, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&tempb, sizeof(cuFloatComplex)*4*N);
  /* max size of M for one kernel call, to determine optimal blocks */
  int T=DEFAULT_TH_PER_BK*ThreadsPerBlock;
  if (M<T) {
    /* small problem: single kernel launch */
    cudakernel_fns_fgradflat(ThreadsPerBlock, BlocksPerGrid, N, M, x, tempeta, y, coh, bbh);
  } else {
    /* reset memory to zero */
    cudaMemset(tempeta, 0, sizeof(cuFloatComplex)*4*N);
    /* loop through M baselines in chunks of at most T */
    int L=(M+T-1)/T;
    int ct=0;
    int myT,ci;
    for (ci=0; ci<L; ci++) {
      if (ct+T<M) {
        myT=T;
      } else {
        myT=M-ct; /* last (possibly partial) chunk */
      }
      int B=(myT+ThreadsPerBlock-1)/ThreadsPerBlock;
      cudakernel_fns_fgradflat(ThreadsPerBlock, B, N, myT, x, tempb, &y[ct*8], &coh[ct*8], &bbh[ct*2]);
      alpha.x=1.0f;alpha.y=0.0f;
      /* tempeta <= tempeta + tempb */
      cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, tempb, 1, tempeta, 1);
      ct=ct+T;
    }
  }
  /* per-station scaling by iw */
  cudakernel_fns_fscale(N, tempeta, iw);
  /* find -ve gradient */
  if (negate) {
    alpha.x=-1.0f;alpha.y=0.0f;
    cbstatus=cublasCscal(cbhandle,4*N,&alpha,tempeta,1);
  }
  /* project onto the tangent space at x */
  cudakernel_fns_proj(N, x, tempeta, eta, cbhandle, solver_handle);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaFree(tempeta);
  cudaFree(tempb);
}
|
||||
|
||||
/* Hessian, also projected to tangent space */
|
||||
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
|
||||
/* Hessian applied to direction eta at x, accumulated over M baselines
   (chunked if M is large), scaled by iw, and projected onto the tangent
   space (result in fhess). Mirrors cudakernel_fns_fgrad. */
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
static void
cudakernel_fns_fhess(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *eta, cuFloatComplex *fhess, float *y, float *coh, short *bbh, float *iw, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  cuFloatComplex *tempeta,*tempb;
  /* NOTE(review): cudaMalloc return values are not checked here */
  cudaMalloc((void**)&tempeta, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&tempb, sizeof(cuFloatComplex)*4*N);

  cuFloatComplex alpha;
  cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;
  /* max size of M for one kernel call, to determine optimal blocks */
  int T=DEFAULT_TH_PER_BK*ThreadsPerBlock;
  if (M<T) {
    /* small problem: single kernel launch */
    cudakernel_fns_fhessflat(ThreadsPerBlock, BlocksPerGrid, N, M, x, eta, tempeta, y, coh, bbh);
  } else {
    /* reset memory to zero */
    cudaMemset(tempeta, 0, sizeof(cuFloatComplex)*4*N);
    /* loop through M baselines in chunks of at most T */
    int L=(M+T-1)/T;
    int ct=0;
    int myT,ci;
    for (ci=0; ci<L; ci++) {
      if (ct+T<M) {
        myT=T;
      } else {
        myT=M-ct; /* last (possibly partial) chunk */
      }
      int B=(myT+ThreadsPerBlock-1)/ThreadsPerBlock;
      cudakernel_fns_fhessflat(ThreadsPerBlock, B, N, myT, x, eta, tempb, &y[ct*8], &coh[ct*8], &bbh[ct*2]);
      alpha.x=1.0f;alpha.y=0.0f;
      /* tempeta <= tempeta + tempb */
      cbstatus=cublasCaxpy(cbhandle,4*N, &alpha, tempb, 1, tempeta, 1);
      ct=ct+T;
    }
  }

  /* per-station scaling by iw, then tangent-space projection */
  cudakernel_fns_fscale(N, tempeta, iw);
  cudakernel_fns_proj(N, x, tempeta, fhess, cbhandle, solver_handle);
  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaFree(tempeta);
  cudaFree(tempb);
}
|
||||
|
||||
/* Armijo step calculation,
|
||||
output teta: Armijo gradient
|
||||
return value: 0 : cost reduced, 1: no cost reduction, so do not run again
|
||||
mincost: minimum value of cost found, if possible
|
||||
*/
|
||||
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
|
||||
/* Armijo backtracking line search along the (projected) gradient at x.
   output teta: Armijo gradient (gradient scaled by the accepted step)
   return value: 0 : cost reduced, 1: no cost reduction, so do not run again
   mincost: minimum value of cost found, if possible
*/
/* need 8N*BlocksPerGrid+ 8N*2 float storage */
static int
armijostep(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *teta, float *y, float *coh, short *bbh, float *iw, float *mincost, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  /* Armijo constants: initial step, shrink factor, sufficient-decrease */
  float alphabar=10.0f;
  float beta=0.2f;
  float sigma=0.5f;
  cublasStatus_t cbstatus;
  /* temp storage, re-using global storage */
  cuFloatComplex *eta, *x_prop;
  cudaMalloc((void**)&eta, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&x_prop, sizeof(cuFloatComplex)*4*N);


  /* cost and (non-negated) gradient at the current point */
  //double fx=fns_f(x,y,gdata);
  float fx=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x,y,coh,bbh);
  //fns_fgrad(x,eta,y,gdata,0);
  cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,x,eta,y,coh,bbh,iw,0,cbhandle, solver_handle);
#ifdef DEBUG
  float eta_nrm;
  cublasScnrm2(cbhandle,4*N,eta,1,&eta_nrm);
  printf("||eta||=%f\n",eta_nrm);
  /* read back eta for checking */
  cuFloatComplex *etalocal;
  cudaHostAlloc((void **)&etalocal, sizeof(cuFloatComplex)*4*N,cudaHostAllocDefault);
  cudaMemcpy(etalocal, eta, 4*N*sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
  printf("Eta=[\n");
  int ci;
  for (ci=0; ci<2*N; ci++) {
    printf("%f %f %f %f\n",etalocal[ci].x,etalocal[ci].y,etalocal[ci+2*N].x,etalocal[ci+2*N].y);
  }
  printf("]\n");
  cudaFreeHost(etalocal);
#endif
  float beta0=beta;
  float minfx=fx; float minbeta=beta0;
  float lhs,rhs,metric;
  int m,nocostred=0;
  cuFloatComplex alpha;
  *mincost=fx;

  /* metric0 = <eta,eta>; reused each iteration instead of recomputing */
  float metric0=cudakernel_fns_g(N,x,eta,eta,cbhandle,solver_handle);
  /* shrink the step by beta each iteration (at most 50 tries) */
  for (m=0; m<50; m++) {
    /* abeta=(beta0)*alphabar*eta; */
    //my_ccopy(4*dp->N,eta,1,teta,1);
    cbstatus=cublasCcopy(cbhandle,4*N,eta,1,teta,1);
    //my_cscal(4*dp->N,beta0*alphabar+0.0*_Complex_I,teta);
    alpha.x=beta0*alphabar;alpha.y=0.0f;
    cbstatus=cublasCscal(cbhandle,4*N,&alpha,teta,1);
    /* Rx=R(x,teta); retract the scaled step */
    //fns_R(dp->N,x,teta,x_prop);
    cudakernel_fns_R(N,x,teta,x_prop,cbhandle,solver_handle);
    //lhs=fns_f(x_prop,y,gdata);
    lhs=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x_prop,y,coh,bbh);
    if (lhs<minfx) {
      /* keep track of the best cost seen, even if Armijo fails */
      minfx=lhs;
      *mincost=minfx;
      minbeta=beta0;
    }
    //rhs=fx+sigma*fns_g(dp->N,x,eta,teta);
    //metric=cudakernel_fns_g(N,x,eta,teta,cbhandle);
    metric=beta0*alphabar*metric0;
    rhs=fx+sigma*metric;
#ifdef DEBUG
    printf("m=%d lhs=%e rhs=%e rat=%e norm=%e\n",m,lhs,rhs,lhs/rhs,metric);
#endif
    /* Armijo condition satisfied: accept this step */
    if ((!isnan(lhs) && lhs<=rhs)) {
      minbeta=beta0;
      break;
    }
    beta0=beta0*beta;
  }

  /* if no further cost improvement is seen */
  if (lhs>fx) {
    nocostred=1;
  }

  /* teta <- minbeta*alphabar * eta : the accepted Armijo step */
  //my_ccopy(4*dp->N,eta,1,teta,1);
  cbstatus=cublasCcopy(cbhandle,4*N,eta,1,teta,1);
  alpha.x=minbeta*alphabar; alpha.y=0.0f;
  //my_cscal(4*dp->N,minbeta*alphabar+0.0*_Complex_I,teta);
  cbstatus=cublasCscal(cbhandle,4*N,&alpha,teta,1);

  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaFree(eta);
  cudaFree(x_prop);

  return nocostred;
}
|
||||
|
||||
|
||||
/* truncated conjugate gradient method
|
||||
x, grad, eta, r, z, delta, Hxd : size 2N x 2 complex
|
||||
so, vector size is 4N complex double
|
||||
|
||||
output: eta
|
||||
return value: stop_tCG code
|
||||
|
||||
y: vec(V) visibilities
|
||||
*/
|
||||
/* need 8N*(BlocksPerGrid+2)+ 8N*6 float storage */
|
||||
/* Truncated (Steihaug-Toint) conjugate-gradient solver for the RTR
   trust-region subproblem, run on the GPU.

   x      : current iterate, 2N x 2 complex stored as 4N cuFloatComplex (device)
   grad   : gradient at x (device, same layout)
   eta    : in/out accumulated solution direction (caller zeroes it first)
   fhess  : in/out accumulated Hessian-times-eta (Heta)
   Delta  : trust-region radius; theta,kappa : residual stopping tolerances
   max_inner/min_inner : iteration bounds
   y,coh,bbh,iw : visibilities, coherencies, baseline map, baseline counts (device)

   Returns stop_tCG code:
     1: negative curvature hit, 2: trust-region boundary hit,
     3: kappa (linear) tolerance, 4: theta (superlinear) tolerance,
     5: ran out of max_inner iterations. */
static int
tcg_solve_cuda(int ThreadsPerBlock, int BlocksPerGrid, int N, int M, cuFloatComplex *x, cuFloatComplex *grad, cuFloatComplex *eta, cuFloatComplex *fhess, float Delta, float theta, float kappa, int max_inner, int min_inner, float *y, float *coh, short *bbh, float *iw, cublasHandle_t cbhandle, cusolverDnHandle_t solver_handle) {
  cuFloatComplex *r,*z,*delta,*Hxd, *rnew;
  float e_Pe, r_r, norm_r, z_r, d_Pd, d_Hd, alpha, e_Pe_new,
     e_Pd, Deltasq, tau, zold_rold, beta, norm_r0;
  int cj, stop_tCG;

  /* work vectors (each 4N complex): residual r, preconditioned residual z,
     CG direction delta, Hessian-vector product Hxd, projected residual rnew */
  cudaMalloc((void**)&r, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&z, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&delta, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&Hxd, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&rnew, sizeof(cuFloatComplex)*4*N);

  cublasStatus_t cbstatus;
  cuFloatComplex a0;

  /*
   initial values
  */
  cbstatus=cublasCcopy(cbhandle,4*N,grad,1,r,1);
  e_Pe=0.0f;

  /* r_r = <r,r> in the Riemannian metric at x */
  r_r=cudakernel_fns_g(N,x,r,r,cbhandle,solver_handle);
  norm_r=sqrtf(r_r);
  norm_r0=norm_r;  /* initial residual norm, used by the kappa/theta test */

  cbstatus=cublasCcopy(cbhandle,4*N,r,1,z,1);

  z_r=cudakernel_fns_g(N,x,z,r,cbhandle,solver_handle);
  d_Pd=z_r;

  /*
   initial search direction: delta = -z
  */
  cudaMemset(delta, 0, sizeof(cuFloatComplex)*4*N);
  a0.x=-1.0f; a0.y=0.0f;
  cbstatus=cublasCaxpy(cbhandle,4*N, &a0, z, 1, delta, 1);
  e_Pd=cudakernel_fns_g(N,x,eta,delta,cbhandle,solver_handle);

  stop_tCG=5;  /* default: loop below exhausts max_inner iterations */

  /* % begin inner/tCG loop
    for j = 1:max_inner,
  */
  for(cj=1; cj<=max_inner; cj++) {
    cudakernel_fns_fhess(ThreadsPerBlock,BlocksPerGrid,N,M,x,delta,Hxd,y,coh,bbh,iw, cbhandle, solver_handle);
    d_Hd=cudakernel_fns_g(N,x,delta,Hxd,cbhandle,solver_handle);

    alpha=z_r/d_Hd;
    e_Pe_new = e_Pe + 2.0f*alpha*e_Pd + alpha*alpha*d_Pd;

    Deltasq=Delta*Delta;
    /* negative curvature, or step would leave the trust region:
       step to the TR boundary and stop */
    if (d_Hd <= 0.0f || e_Pe_new >= Deltasq) {
      tau = (-e_Pd + sqrtf(e_Pd*e_Pd + d_Pd*(Deltasq-e_Pe)))/d_Pd;
      a0.x=tau;  /* a0.y is still 0 from the initialization above */
      cbstatus=cublasCaxpy(cbhandle,4*N, &a0, delta, 1, eta, 1);
      /* Heta = Heta + tau *Hdelta */
      cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, fhess, 1);
      stop_tCG=(d_Hd<=0.0f?1:2);
      break;
    }

    e_Pe=e_Pe_new;
    a0.x=alpha;
    cbstatus=cublasCaxpy(cbhandle,4*N, &a0, delta, 1, eta, 1);
    /* Heta = Heta + alpha*Hdelta */
    cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, fhess, 1);

    /* r = r + alpha*Hdelta, then re-project onto the tangent space at x */
    cbstatus=cublasCaxpy(cbhandle,4*N, &a0, Hxd, 1, r, 1);
    cudakernel_fns_proj(N, x, r, rnew, cbhandle,solver_handle);
    cbstatus=cublasCcopy(cbhandle,4*N,rnew,1,r,1);
    r_r=cudakernel_fns_g(N,x,r,r,cbhandle,solver_handle);
    norm_r=sqrtf(r_r);

    /*
     check kappa/theta stopping criterion
    */
    if (cj >= min_inner) {
      float norm_r0pow=powf(norm_r0,theta);
      if (norm_r <= norm_r0*MIN(norm_r0pow,kappa)) {
        /* residual small enough: kappa => linear, theta => superlinear rate */
        stop_tCG=(kappa<norm_r0pow?3:4);
        break;
      }
    }

    cbstatus=cublasCcopy(cbhandle,4*N,r,1,z,1);
    zold_rold=z_r;

    z_r=cudakernel_fns_g(N,x,z,r,cbhandle,solver_handle);

    /* CG direction update: delta = -z + beta*delta */
    beta=z_r/zold_rold;
    a0.x=beta;
    cbstatus=cublasCscal(cbhandle,4*N,&a0,delta,1);
    a0.x=-1.0f;
    cbstatus=cublasCaxpy(cbhandle,4*N, &a0, z, 1, delta, 1);

    /* track <eta,delta> and <delta,delta> recurrences, avoiding extra
       inner products on the device */
    e_Pd = beta*(e_Pd + alpha*d_Pd);
    d_Pd = z_r + beta*beta*d_Pd;
  }

  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaFree(r);
  cudaFree(z);
  cudaFree(delta);
  cudaFree(Hxd);
  cudaFree(rnew);
  return stop_tCG;
}
|
||||
|
||||
|
||||
/* follow clmfit_fl.c */
|
||||
/* Riemannian trust-region solver (single precision, GPU):
   a few steepest-descent (Armijo) iterations to get into the basin,
   followed by RTR with a truncated-CG inner solver.
   x0 is both input (initial J) and output (solution); info[0]/info[1]
   receive the initial/final cost. Returns 0. */
int
rtr_solve_cuda_fl(
  float *x0, /* initial values and updated solution at output (size 8*N float) */
  float *y, /* data vector (size 8*M float) */
  int N, /* no of stations */
  int M, /* no of constraints */
  int itmax_sd, /* maximum number of iterations RSD */
  int itmax_rtr, /* maximum number of iterations RTR */
  float Delta_bar, float Delta0, /* Trust region radius and initial value
                                    NOTE: both are recomputed from the RSD cost
                                    below, so the passed values are overridden */
  double *info, /* initial and final residuals */

  cublasHandle_t cbhandle, /* device handle */
  cusolverDnHandle_t solver_handle, /* solver handle */
  int tileoff, /* tile offset when solving for many chunks */
  int ntiles, /* total tile (data) size being solved for */
  me_data_t *adata)
{

  /* general note: all device variables end with a 'd' */
  cudaError_t err;
  cublasStatus_t cbstatus=CUBLAS_STATUS_SUCCESS;

  /* ME data */
  me_data_t *dp=(me_data_t*)adata;
  int Nbase=(dp->Nbase)*(ntiles); /* note: we do not use the total tile size */
  /* coherency on device */
  float *cohd;
  /* baseline-station map on device/host */
  short *bbd;

  /* calculate no of cuda threads and blocks */
  int ThreadsPerBlock=DEFAULT_TH_PER_BK;
  int BlocksPerGrid= 2*(M+ThreadsPerBlock-1)/ThreadsPerBlock;

  /* reshape x to make J: 2Nx2 complex double
  */
  complex float *x;
  if ((x=(complex float*)malloc((size_t)4*N*sizeof(complex float)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  /* map x: [(re,im)J_1(0,0) (re,im)J_1(0,1) (re,im)J_1(1,0) (re,im)J_1(1,1)...]
     to
     J: [J_1(0,0) J_1(1,0) J_2(0,0) J_2(1,0) ..... J_1(0,1) J_1(1,1) J_2(0,1) J_2(1,1)....]
  */
  float *Jd=(float*)x;
  /* re J(0,0) */
  my_fcopy(N, &x0[0], 8, &Jd[0], 4);
  /* im J(0,0) */
  my_fcopy(N, &x0[1], 8, &Jd[1], 4);
  /* re J(1,0) */
  my_fcopy(N, &x0[4], 8, &Jd[2], 4);
  /* im J(1,0) */
  my_fcopy(N, &x0[5], 8, &Jd[3], 4);
  /* re J(0,1) */
  my_fcopy(N, &x0[2], 8, &Jd[4*N], 4);
  /* im J(0,1) */
  my_fcopy(N, &x0[3], 8, &Jd[4*N+1], 4);
  /* re J(1,1) */
  my_fcopy(N, &x0[6], 8, &Jd[4*N+2], 4);
  /* im J(1,1) */
  my_fcopy(N, &x0[7], 8, &Jd[4*N+3], 4);

  int ci;

  /***************************************************/
  cuFloatComplex *xd,*fgradxd,*etad,*Hetad,*x_propd;
  float *yd;
  /* for counting how many baselines contribute to each station
     grad/hess calculation */
  float *iwd,*iw;
  if ((iw=(float*)malloc((size_t)N*sizeof(float)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  /* device workspace: gradient, TR direction, Hessian*eta, proposed iterate,
     current iterate, data, coherencies, baseline map, baseline counts */
  cudaMalloc((void**)&fgradxd, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&etad, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&Hetad, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&x_propd, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&xd, sizeof(cuFloatComplex)*4*N);
  cudaMalloc((void**)&yd, sizeof(float)*8*M);
  cudaMalloc((void**)&cohd, sizeof(float)*8*Nbase);
  cudaMalloc((void**)&bbd, sizeof(short)*2*Nbase);
  cudaMalloc((void**)&iwd, sizeof(float)*N);

  /* need 8N*(BlocksPerGrid+8) for tcg_solve+grad/hess storage,
     so total storage needed is
     8N*(BlocksPerGrid+8) + 8N*5 + 8*M + 8*Nbase + 2*Nbase + N
  */
  /* yd <=y : V */
  err=cudaMemcpy(yd, y, 8*M*sizeof(float), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
  /* need to give right offset for coherencies */
  /* offset: cluster offset+time offset */
  /* C */
  err=cudaMemcpy(cohd, &(dp->ddcohf[(dp->Nbase)*(dp->tilesz)*(dp->clus)*8+(dp->Nbase)*tileoff*8]), Nbase*8*sizeof(float), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
  /* correct offset for baselines */
  err=cudaMemcpy(bbd, &(dp->ddbase[2*(dp->Nbase)*(tileoff)]), Nbase*2*sizeof(short), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
  /* xd <=x : solution */
  err=cudaMemcpy(xd, x, 8*N*sizeof(float), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);

  float fx,fx0,norm_grad,Delta,fx_prop,rhonum,rhoden,rho;

  /* count how many baselines contribute to each station, store (inverse) in iwd */
  count_baselines(Nbase,N,iw,&(dp->ddbase[2*(dp->Nbase)*(tileoff)]),dp->Nt);
  err=cudaMemcpy(iwd, iw, N*sizeof(float), cudaMemcpyHostToDevice);
  checkCudaError(err,__FILE__,__LINE__);
  free(iw);

  fx=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,xd,yd,cohd,bbd);
  fx0=fx;
#ifdef DEBUG
  printf("Initial Cost=%g\n",fx0);
#endif
  /***************************************************/
  int rsdstat=0;
  /* RSD solution: a few Armijo line-search steepest-descent steps first */
  for (ci=0; ci<itmax_sd; ci++) {
    /* Armijo step */
    /* teta=armijostep(V,C,N,x); */
    //armijostep(N,x,eta,y,&gdata);
    rsdstat=armijostep(ThreadsPerBlock, BlocksPerGrid, N, M, xd, etad, yd, cohd, bbd,iwd,&fx,cbhandle,solver_handle);
    /* x=R(x,teta); retraction of the step back onto the manifold */
    cudakernel_fns_R(N,xd,etad,x_propd,cbhandle,solver_handle);
    //my_ccopy(4*N,x_propd,1,xd,1);
    if (!rsdstat) {
      /* cost reduced, update solution */
      cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
    } else {
      /* no cost reduction, break loop */
      break;
    }
  }

  /* rescale trust-region radii from the post-RSD cost
     (overrides the Delta_bar/Delta0 arguments) */
  Delta_bar=MIN(fx,0.01f);
  Delta0=Delta_bar*0.125f;
  //printf("fx=%g Delta_bar=%g Delta0=%g\n",fx,Delta_bar,Delta0);

#ifdef DEBUG
  printf("NEW RSD cost=%g\n",fx);
#endif
  /***************************************************/
  int min_inner,max_inner,min_outer,max_outer;
  float epsilon,kappa,theta,rho_prime;

  min_inner=1; max_inner=itmax_rtr;//8*N;
  min_outer=3;//itmax_rtr; //3;
  max_outer=itmax_rtr;
  epsilon=(float)CLM_EPSILON;
  kappa=0.1f;
  theta=1.0f;
  /* default values 0.25, 0.75, 0.25, 2.0 */
  float eta1=0.0001f; float eta2=0.99f; float alpha1=0.25f; float alpha2=3.5f;
  rho_prime=eta1; /* should be <= 0.25, tune for parallel solve */
  float rho_regularization; /* use large damping */
  rho_regularization=fx*1e-6f;
  /* damping: too small => locally converge, globally diverge
        |\
     |\ | \___
   -|\ | \|
        \

    right damping: locally and globally converge
   -|\
     \|\
       \|\
         \____
  */
  float rho_reg;
  int model_decreased=0;

  /* RTR solution */
  int k=0;
  int stop_outer=(itmax_rtr>0?0:1);
  int stop_inner=0;
  if (!stop_outer) {
    /* gradient (negated form, flag=1) and its Riemannian norm */
    cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,1,cbhandle,solver_handle);
    norm_grad=sqrtf(cudakernel_fns_g(N,xd,fgradxd,fgradxd,cbhandle,solver_handle));
  }
  Delta=Delta0;
  /* initial residual */
  info[0]=fx0;

  /*
   % ** Start of TR loop **
  */
  while(!stop_outer) {
    /*
     % update counter
    */
    k++;
    /* eta = 0*fgradx; */
    cudaMemset(etad, 0, sizeof(cuFloatComplex)*4*N);

    /* solve TR subproblem, also returns Hessian */
    stop_inner=tcg_solve_cuda(ThreadsPerBlock,BlocksPerGrid, N, M, xd, fgradxd, etad, Hetad, Delta, theta, kappa, max_inner, min_inner,yd,cohd,bbd,iwd,cbhandle, solver_handle);
    /*
     Heta = fns.fhess(x,eta);
    */
    /*
     compute the retraction of the proposal
    */
    cudakernel_fns_R(N,xd,etad,x_propd,cbhandle,solver_handle);

    /*
     compute cost of the proposal
    */
    fx_prop=cudakernel_fns_f(ThreadsPerBlock,BlocksPerGrid,N,M,x_propd,yd,cohd,bbd);

    /*
     check the performance of the quadratic model
    */
    rhonum=fx-fx_prop;
    rhoden=-cudakernel_fns_g(N,xd,fgradxd,etad,cbhandle,solver_handle)-0.5f*cudakernel_fns_g(N,xd,Hetad,etad,cbhandle,solver_handle);
    /* regularization of rho ratio */
    /*
     rho_reg = max(1, abs(fx)) * eps * options.rho_regularization;
     rhonum = rhonum + rho_reg;
     rhoden = rhoden + rho_reg;
    */
    rho_reg=MAX(1.0f,fx)*rho_regularization; /* no epsilon */
    rhonum+=rho_reg;
    rhoden+=rho_reg;

    /*
     rho = rhonum / rhoden;
    */
    rho=rhonum/rhoden;

    /* model_decreased = (rhoden >= 0); */
    /* OLD CODE if (fabsf(rhonum/fx) <sqrtf_epsilon) {
       rho=1.0f;
    } */
    model_decreased=(rhoden>=0.0f?1:0);

#ifdef DEBUG
    printf("stop_inner=%d rho_reg=%g rho =%g/%g= %g rho'= %g\n",stop_inner,rho_reg,rhonum,rhoden,rho,rho_prime);
#endif
    /*
     choose new TR radius based on performance
    */
    if ( !model_decreased || rho<eta1 ) {
      /* poor model agreement: shrink the radius */
      Delta=alpha1*Delta;
    } else if (rho>eta2 && (stop_inner==2 || stop_inner==1)) {
      /* very good agreement and tCG hit the boundary: grow (capped) */
      Delta=MIN(alpha2*Delta,Delta_bar);
    }

    /*
     choose new iterate based on performance
    */
    if (model_decreased && rho>rho_prime) {
      cbstatus=cublasCcopy(cbhandle,4*N,x_propd,1,xd,1);
      fx=fx_prop;
      cudakernel_fns_fgrad(ThreadsPerBlock,BlocksPerGrid,N,M,xd,fgradxd,yd,cohd,bbd,iwd,1,cbhandle,solver_handle);
      norm_grad=sqrtf(cudakernel_fns_g(N,xd,fgradxd,fgradxd,cbhandle,solver_handle));
    }

    /*
     Testing for Stop Criteria
    */
    if (norm_grad<epsilon && k>min_outer) {
      stop_outer=1;
    }

    /*
     stop after max_outer iterations
    */
    if (k>=max_outer) {
      stop_outer=1;
    }

#ifdef DEBUG
    printf("Iter %d cost=%g\n",k,fx);
#endif

  }
  /* final residual */
  info[1]=fx;
#ifdef DEBUG
  printf("NEW RTR cost=%g\n",fx);
#endif

  /***************************************************/
  checkCublasError(cbstatus,__FILE__,__LINE__);
  cudaDeviceSynchronize();

  /* only write back if the cost actually improved */
  if(fx0>fx) {
    //printf("Cost final %g initial %g\n",fx,fx0);
    /* copy back current solution */
    err=cudaMemcpy(x,xd,8*N*sizeof(float),cudaMemcpyDeviceToHost);
    checkCudaError(err,__FILE__,__LINE__);

    /* copy back solution to x0 : format checked*/
    /* re J(0,0) */
    my_fcopy(N, &Jd[0], 4, &x0[0], 8);
    /* im J(0,0) */
    my_fcopy(N, &Jd[1], 4, &x0[1], 8);
    /* re J(1,0) */
    my_fcopy(N, &Jd[2], 4, &x0[4], 8);
    /* im J(1,0) */
    my_fcopy(N, &Jd[3], 4, &x0[5], 8);
    /* re J(0,1) */
    my_fcopy(N, &Jd[4*N], 4, &x0[2], 8);
    /* im J(0,1) */
    my_fcopy(N, &Jd[4*N+1], 4, &x0[3], 8);
    /* re J(1,1) */
    my_fcopy(N, &Jd[4*N+2], 4, &x0[6], 8);
    /* im J(1,1) */
    my_fcopy(N, &Jd[4*N+3], 4, &x0[7], 8);

  }
  free(x);

  cudaFree(fgradxd);
  cudaFree(etad);
  cudaFree(Hetad);
  cudaFree(x_propd);
  cudaFree(xd);
  cudaFree(yd);
  cudaFree(cohd);
  cudaFree(bbd);
  cudaFree(iwd);

  return 0;
}
|
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -1,443 +0,0 @@
|
|||
/*
|
||||
*
|
||||
Copyright (C) 2006-2008 Sarod Yatawatta <sarod@users.sf.net>
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#include "Solvers.h"
|
||||
#include <math.h>
|
||||
|
||||
/* Digamma function
|
||||
if x>7 use digamma(x) = digamma(x+1) - 1/x
|
||||
for accuracy
|
||||
using maple expansion
|
||||
series(Psi(x+1/2), x=infinity, 21);
|
||||
ln(x)+1/24/x^2-7/960/x^4+31/8064/x^6-127/30720/x^8+511/67584/x^10-1414477/67092480/x^12+8191/98304/x^14-118518239/267386880/x^16+5749691557/1882718208/x^18-91546277357/3460300800/x^20+O(1/x^21)
|
||||
|
||||
|
||||
based on code by Mark Johnson, 2nd September 2007
|
||||
*/
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static double
|
||||
digamma(double x) {
|
||||
/* FIXME catch -ve value as input */
|
||||
double result = 0.0, xx, xx2, xx4;
|
||||
for ( ; x < 7.0; ++x) { /* reduce x till x<7 */
|
||||
result -= 1.0/x;
|
||||
}
|
||||
x -= 0.5;
|
||||
xx = 1.0/x;
|
||||
xx2 = xx*xx;
|
||||
xx4 = xx2*xx2;
|
||||
result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* update w<= (nu+1)/(nu+delta^2)
|
||||
then q <= w-log(w), so that it is +ve
|
||||
*/
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static void *
|
||||
w_nu_update_threadfn(void *data) {
|
||||
thread_data_vecnu_t *t=(thread_data_vecnu_t*)data;
|
||||
int ci;
|
||||
for (ci=t->starti; ci<=t->endi; ci++) {
|
||||
//t->ed[ci]*=t->wtd[ci]; ??
|
||||
t->wtd[ci]=(t->nu0+1.0)/(t->nu0+t->ed[ci]*t->ed[ci]);
|
||||
t->q[ci]=t->wtd[ci]-log(t->wtd[ci]);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* update w<= sqrt(w) */
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static void *
|
||||
w_sqrt_threadfn(void *data) {
|
||||
thread_data_vecnu_t *t=(thread_data_vecnu_t*)data;
|
||||
int ci;
|
||||
for (ci=t->starti; ci<=t->endi; ci++) {
|
||||
t->wtd[ci]=sqrt(t->wtd[ci]);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* update nu */
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static void *
|
||||
q_update_threadfn(void *data) {
|
||||
thread_data_vecnu_t *t=(thread_data_vecnu_t*)data;
|
||||
int ci;
|
||||
double thisnu,dgm;
|
||||
for (ci=t->starti; ci<=t->endi; ci++) {
|
||||
thisnu=(t->nulow+(double)ci*t->nu0); /* deltanu stored in nu0 */
|
||||
dgm=digamma(thisnu*0.5+0.5);
|
||||
t->q[ci]=dgm-log((thisnu+1.0)*0.5); /* psi((nu+1)/2)-log((nu+1)/2) */
|
||||
dgm=digamma(thisnu*0.5);
|
||||
t->q[ci]+=-dgm+log((thisnu)*0.5); /* -psi((nu)/2)+log((nu)/2) */
|
||||
t->q[ci]+=-t->sumq+1.0; /* q is w-log(w), so -ve: sum(ln(w_i))/N-sum(w_i)/N+1 */
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* update nu */
|
||||
#ifdef USE_MIC
|
||||
__attribute__ ((target(MIC)))
|
||||
#endif
|
||||
static void *
|
||||
q_update_threadfn_aecm(void *data) {
|
||||
thread_data_vecnu_t *t=(thread_data_vecnu_t*)data;
|
||||
int ci;
|
||||
double thisnu,dgm;
|
||||
for (ci=t->starti; ci<=t->endi; ci++) {
|
||||
thisnu=(t->nulow+(double)ci*t->nu0); /* deltanu stored in nu0 */
|
||||
dgm=digamma(thisnu*0.5);
|
||||
t->q[ci]=-dgm+log((thisnu)*0.5); /* -psi((nu)/2)+log((nu)/2) */
|
||||
t->q[ci]+=-t->sumq+1.0; /* q is w-log(w), so -ve: sum(ln(w_i))/N-sum(w_i)/N+1 */
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* update nu (degrees of freedom)
|
||||
also update w
|
||||
|
||||
nu0: current value of nu
|
||||
w: Nx1 weight vector
|
||||
ed: Nx1 residual error
|
||||
|
||||
|
||||
psi() : digamma function
|
||||
   find solution to
|
||||
psi((nu+1)/2)-ln((nu+1)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0
|
||||
use ln(gamma()) => lgamma_r
|
||||
*/
|
||||
double
|
||||
update_w_and_nu(double nu0, double *w, double *ed, int N, int Nt, double nulow, double nuhigh) {
|
||||
int Nd=30; /* no of samples to estimate nu */
|
||||
int nth,nth1,ci;
|
||||
int Nthb0,Nthb;
|
||||
pthread_attr_t attr;
|
||||
pthread_t *th_array;
|
||||
thread_data_vecnu_t *threaddata;
|
||||
|
||||
double deltanu,*q,thisnu,sumq;
|
||||
if ((q=(double*)calloc((size_t)N,sizeof(double)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* setup threads */
|
||||
pthread_attr_init(&attr);
|
||||
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
|
||||
|
||||
if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
if ((threaddata=(thread_data_vecnu_t*)malloc((size_t)Nt*sizeof(thread_data_vecnu_t)))==0) {
|
||||
#ifndef USE_MIC
|
||||
fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
|
||||
#endif
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* calculate min values a thread can handle */
|
||||
Nthb0=(N+Nt-1)/Nt;
|
||||
/* iterate over threads, allocating indices per thread */
|
||||
ci=0;
|
||||
for (nth=0; nth<Nt && ci<N; nth++) {
|
||||
if (ci+Nthb0<N) {
|
||||
Nthb=Nthb0;
|
||||
} else {
|
||||
Nthb=N-ci;
|
||||
}
|
||||
threaddata[nth].starti=ci;
|
||||
threaddata[nth].endi=ci+Nthb-1;
|
||||
threaddata[nth].ed=ed;
|
||||
threaddata[nth].wtd=w;
|
||||
threaddata[nth].q=q;
|
||||
threaddata[nth].nu0=nu0;
|
||||
threaddata[nth].nulow=nulow;
|
||||
threaddata[nth].nuhigh=nuhigh;
|
||||
pthread_create(&th_array[nth],&attr,w_nu_update_threadfn,(void*)(&threaddata[nth]));
|
||||
/* next baseline set */
|
||||
ci=ci+Nthb;
|
||||
}
|
||||
/* now wait for threads to finish */
|
||||
for(nth1=0; nth1<nth; nth1++) {
|
||||
pthread_join(th_array[nth1],NULL);
|
||||
}
|
||||
sumq=my_dasum(N,q)/(double)N; /* sum(|w_i-log(w_i)|/N), assume all elements are +ve */
|
||||
for(nth1=0; nth1<nth; nth1++) {
|
||||
pthread_create(&th_array[nth1],&attr,w_sqrt_threadfn,(void*)(&threaddata[nth1]));
|
||||
}
|
||||
for(nth1=0; nth1<nth; nth1++) {
|
||||
pthread_join(th_array[nth1],NULL);
|
||||
}
|
||||
|
||||
/* search range 2 to 30 because if nu~=30, its Gaussian */
|
||||
deltanu=(double)(nuhigh-nulow)/(double)Nd;
|
||||
Nthb0=(Nd+Nt-1)/Nt;
|
||||
/* check for too low number of values per thread, halve the threads */
|
||||
if (Nthb0<=2) {
|
||||
Nt=Nt/2;
|
||||
Nthb0=(Nd+Nt-1)/Nt;
|
||||
}
|
||||
ci=0;
|
||||
for (nth=0; nth<Nt && ci<Nd; nth++) {
|
||||
if (ci+Nthb0<Nd) {
|
||||
Nthb=Nthb0;
|
||||
} else {
|
||||
Nthb=Nd-ci;
|
||||
}
|
||||
threaddata[nth].starti=ci;
|
||||
threaddata[nth].endi=ci+Nthb-1;
|
||||
threaddata[nth].q=q;
|
||||
threaddata[nth].nu0=deltanu;
|
||||
threaddata[nth].sumq=sumq;
|
||||
pthread_create(&th_array[nth],&attr,q_update_threadfn,(void*)(&threaddata[nth]));
|
||||
/* next baseline set */
|
||||
ci=ci+Nthb;
|
||||
}
|
||||
/* now wait for threads to finish */
|
||||
for(nth1=0; nth1<nth; nth1++) {
|
||||
pthread_join(th_array[nth1],NULL);
|
||||
}
|
||||
|
||||
pthread_attr_destroy(&attr);
|
||||
free(th_array);
|
||||
free(threaddata);
|
||||
|
||||
ci=my_idamin(Nd,q,1);
|
||||
thisnu=(nulow+(double)ci*deltanu);
|
||||
|
||||
free(q);
|
||||
return thisnu;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* update nu (degrees of freedom)
|
||||
nu_old: old nu
|
||||
logsumw = 1/N sum(log(w_i)-w_i)
|
||||
|
||||
use Nd values in [nulow,nuhigh] to find nu
|
||||
|
||||
|
||||
psi() : digamma function
|
||||
   find solution to
|
||||
psi((nu_old+p)/2)-ln((nu_old+p)/2)-psi(nu/2)+ln(nu/2)+1/N sum(ln(w_i)-w_i) +1 = 0
|
||||
use ln(gamma()) => lgamma_r
|
||||
|
||||
p: 1 or 8
|
||||
*/
|
||||
double
update_nu(double logsumw, int Nd, int Nt, double nulow, double nuhigh, int p, double nu_old) {
  int ci,nth,nth1,Nthb,Nthb0;
  double deltanu,thisnu,*q;
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_vecnu_t *threaddata;

  /* q[i]: objective value at the i-th candidate nu on the search grid */
  if ((q=(double*)calloc((size_t)Nd,sizeof(double)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: no free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  /* setup threads */
  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);

  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  if ((threaddata=(thread_data_vecnu_t*)malloc((size_t)Nt*sizeof(thread_data_vecnu_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }
  /* calculate psi((nu_old+p)/2)-ln((nu_old+p)/2), constant over the grid,
     folded into sumq below so worker threads need not recompute it */
  double dgm=digamma((nu_old+(double)p)*0.5);
  dgm=dgm-log((nu_old+(double)p)*0.5); /* psi((nu+p)/2)-log((nu+p)/2) */

  deltanu=(double)(nuhigh-nulow)/(double)Nd;
  Nthb0=(Nd+Nt-1)/Nt;
  /* check for too low number of values per thread, halve the threads */
  if (Nthb0<=2) {
    Nt=Nt/2;
    Nthb0=(Nd+Nt-1)/Nt;
  }
  /* fan out: evaluate the AECM nu objective at each grid point */
  ci=0;
  for (nth=0; nth<Nt && ci<Nd; nth++) {
    if (ci+Nthb0<Nd) {
      Nthb=Nthb0;
    } else {
      Nthb=Nd-ci;
    }
    threaddata[nth].starti=ci;
    threaddata[nth].endi=ci+Nthb-1;
    threaddata[nth].q=q;
    threaddata[nth].nu0=deltanu;  /* grid spacing, read back as nu0 in the worker */
    threaddata[nth].nulow=nulow;
    threaddata[nth].nuhigh=nuhigh;
    threaddata[nth].sumq=-logsumw-dgm;  /* constant part of the objective */
    pthread_create(&th_array[nth],&attr,q_update_threadfn_aecm,(void*)(&threaddata[nth]));
    /* next baseline set */
    ci=ci+Nthb;
  }
  /* now wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
    pthread_join(th_array[nth1],NULL);
  }

  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);

  /* pick the grid point minimizing the objective
     (my_idamin presumably follows the 1-based BLAS idamin convention —
      TODO confirm) */
  ci=my_idamin(Nd,q,1);
  thisnu=(nulow+((double)ci)*deltanu);

  free(q);
  return thisnu;
}
|
||||
|
||||
|
||||
/* ud = sqrt(u^2+v^2) */
|
||||
static double
|
||||
ncp_weight(double ud) {
|
||||
/* fo(x) = 1/(1+alpha*exp(-x/A))
|
||||
A ~=30
|
||||
*/
|
||||
if (ud>400.0) return 1.0; /* no effect on long baselines */
|
||||
//return 1.0/(1.0+0.4*exp(-0.05*ud));
|
||||
return 1.0/(1.0+1.8*exp(-0.05*ud));
|
||||
}
|
||||
|
||||
static void *
|
||||
threadfn_setblweight(void *data) {
|
||||
thread_data_baselinewt_t *t=(thread_data_baselinewt_t*)data;
|
||||
|
||||
int ci;
|
||||
for (ci=0; ci<t->Nb; ci++) {
|
||||
/* get sqrt(u^2+v^2) */
|
||||
double uu=t->u[ci+t->boff]*t->freq0;
|
||||
double vv=t->v[ci+t->boff]*t->freq0;
|
||||
double a=ncp_weight(sqrt(uu*uu+vv*vv));
|
||||
t->wt[8*(ci+t->boff)]*=a;
|
||||
t->wt[8*(ci+t->boff)+1]*=a;
|
||||
t->wt[8*(ci+t->boff)+2]*=a;
|
||||
t->wt[8*(ci+t->boff)+3]*=a;
|
||||
t->wt[8*(ci+t->boff)+4]*=a;
|
||||
t->wt[8*(ci+t->boff)+5]*=a;
|
||||
t->wt[8*(ci+t->boff)+6]*=a;
|
||||
t->wt[8*(ci+t->boff)+7]*=a;
|
||||
//printf("%lf %lf %lf\n",uu,vv,a);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
taper data by weighting based on uv distance (for short baselines)
|
||||
for example: use weights as the inverse density function
|
||||
1/( 1+f(u,v) )
|
||||
as u,v->inf, f(u,v) -> 0 so long baselines are not affected
|
||||
x: Nbase*8 x 1 (input,output) data
|
||||
u,v : Nbase x 1
|
||||
note: u = u/c, v=v/c here, so need freq to convert to wavelengths */
|
||||
void
whiten_data(int Nbase, double *x, double *u, double *v, double freq0, int Nt) {
  pthread_attr_t attr;
  pthread_t *th_array;
  thread_data_baselinewt_t *threaddata;

  int ci,nth1,nth;
  int Nthb0,Nthb;

  /* baselines per thread (ceiling division) */
  Nthb0=(Nbase+Nt-1)/Nt;

  pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);

  if ((th_array=(pthread_t*)malloc((size_t)Nt*sizeof(pthread_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  if ((threaddata=(thread_data_baselinewt_t*)malloc((size_t)Nt*sizeof(thread_data_baselinewt_t)))==0) {
#ifndef USE_MIC
    fprintf(stderr,"%s: %d: No free memory\n",__FILE__,__LINE__);
#endif
    exit(1);
  }

  /* iterate over threads, allocating baselines per thread */
  ci=0;
  for (nth=0; nth<Nt && ci<Nbase; nth++) {
    if (ci+Nthb0<Nbase) {
      Nthb=Nthb0;
    } else {
      /* last chunk: whatever remains */
      Nthb=Nbase-ci;
    }

    threaddata[nth].Nb=Nthb;
    threaddata[nth].boff=ci;
    threaddata[nth].wt=x;    /* weights scaled in place */
    threaddata[nth].u=u;
    threaddata[nth].v=v;
    threaddata[nth].freq0=freq0;

    pthread_create(&th_array[nth],&attr,threadfn_setblweight,(void*)(&threaddata[nth]));
    /* next baseline set */
    ci=ci+Nthb;
  }

  /* now wait for threads to finish */
  for(nth1=0; nth1<nth; nth1++) {
    pthread_join(th_array[nth1],NULL);
  }

  pthread_attr_destroy(&attr);
  free(th_array);
  free(threaddata);
}
|
Binary file not shown.
2641
src/lib/sagecal.h
2641
src/lib/sagecal.h
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue