916 lines
28 KiB
C
916 lines
28 KiB
C
|
/*
|
||
|
Copyright (c) 2013 Julien Pommier.
|
||
|
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
||
|
*/
|
||
|
|
||
|
#define _WANT_SNAN 1
|
||
|
|
||
|
#include "pffft.h"
|
||
|
#include "pffastconv.h"
|
||
|
|
||
|
#include <math.h>
|
||
|
#include <float.h>
|
||
|
#include <limits.h>
|
||
|
#include <inttypes.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <time.h>
|
||
|
#include <assert.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
#ifdef HAVE_SYS_TIMES
|
||
|
# include <sys/times.h>
|
||
|
# include <unistd.h>
|
||
|
#endif
|
||
|
|
||
|
/*
|
||
|
vector support macros: the rest of the code is independant of
|
||
|
SSE/Altivec/NEON -- adding support for other platforms with 4-element
|
||
|
vectors should be limited to these macros
|
||
|
*/
|
||
|
#if 0
|
||
|
#include "simd/pf_float.h"
|
||
|
#endif
|
||
|
|
||
|
#if defined(_MSC_VER)
|
||
|
# define RESTRICT __restrict
|
||
|
#elif defined(__GNUC__)
|
||
|
# define RESTRICT __restrict
|
||
|
#else
|
||
|
# define RESTRICT
|
||
|
#endif
|
||
|
|
||
|
|
||
|
#if defined(_MSC_VER)
|
||
|
#pragma warning( disable : 4244 )
|
||
|
#endif
|
||
|
|
||
|
|
||
|
#ifdef SNANF
|
||
|
#define INVALID_FLOAT_VAL SNANF
|
||
|
#elif defined(SNAN)
|
||
|
#define INVALID_FLOAT_VAL SNAN
|
||
|
#elif defined(NAN)
|
||
|
#define INVALID_FLOAT_VAL NAN
|
||
|
#elif defined(INFINITY)
|
||
|
#define INVALID_FLOAT_VAL INFINITY
|
||
|
#else
|
||
|
#define INVALID_FLOAT_VAL FLT_MAX
|
||
|
#endif
|
||
|
|
||
|
|
||
|
#if defined(HAVE_SYS_TIMES)
|
||
|
inline double uclock_sec(void) {
|
||
|
static double ttclk = 0.;
|
||
|
struct tms t;
|
||
|
if (ttclk == 0.)
|
||
|
ttclk = sysconf(_SC_CLK_TCK);
|
||
|
times(&t);
|
||
|
/* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
|
||
|
return ((double)t.tms_utime)) / ttclk;
|
||
|
}
|
||
|
# else
|
||
|
double uclock_sec(void)
|
||
|
{ return (double)clock()/(double)CLOCKS_PER_SEC; }
|
||
|
#endif
|
||
|
|
||
|
|
||
|
|
||
|
typedef int (*pfnConvolution) (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
|
||
|
typedef void* (*pfnConvSetup) (float *Hfwd, int Nf, int * BlkLen, int flags);
|
||
|
typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
|
||
|
typedef void (*pfnConvDestroy) (void * setup);
|
||
|
|
||
|
|
||
|
struct ConvSetup
|
||
|
{
|
||
|
pfnConvolution pfn;
|
||
|
int N;
|
||
|
int B;
|
||
|
float * H;
|
||
|
int flags;
|
||
|
};
|
||
|
|
||
|
|
||
|
void * convSetupRev( float * H, int N, int * BlkLen, int flags )
|
||
|
{
|
||
|
struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
|
||
|
int i, Nr = N;
|
||
|
if (flags & PFFASTCONV_CPLX_INP_OUT)
|
||
|
Nr *= 2;
|
||
|
Nr += 4;
|
||
|
s->pfn = NULL;
|
||
|
s->N = N;
|
||
|
s->B = *BlkLen;
|
||
|
s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
|
||
|
s->flags = flags;
|
||
|
memset(s->H, 0, (unsigned)Nr * sizeof(float));
|
||
|
if (flags & PFFASTCONV_CPLX_INP_OUT)
|
||
|
{
|
||
|
for ( i = 0; i < N; ++i ) {
|
||
|
s->H[2*(N-1 -i) ] = H[i];
|
||
|
s->H[2*(N-1 -i)+1] = H[i];
|
||
|
}
|
||
|
/* simpler detection of overruns */
|
||
|
s->H[ 2*N ] = INVALID_FLOAT_VAL;
|
||
|
s->H[ 2*N +1 ] = INVALID_FLOAT_VAL;
|
||
|
s->H[ 2*N +2 ] = INVALID_FLOAT_VAL;
|
||
|
s->H[ 2*N +3 ] = INVALID_FLOAT_VAL;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
for ( i = 0; i < N; ++i )
|
||
|
s->H[ N-1 -i ] = H[i];
|
||
|
/* simpler detection of overruns */
|
||
|
s->H[ N ] = INVALID_FLOAT_VAL;
|
||
|
s->H[ N +1 ] = INVALID_FLOAT_VAL;
|
||
|
s->H[ N +2 ] = INVALID_FLOAT_VAL;
|
||
|
s->H[ N +3 ] = INVALID_FLOAT_VAL;
|
||
|
}
|
||
|
return s;
|
||
|
}
|
||
|
|
||
|
void convDestroyRev( void * setup )
|
||
|
{
|
||
|
struct ConvSetup * s = (struct ConvSetup*)setup;
|
||
|
pffastconv_free(s->H);
|
||
|
pffastconv_free(setup);
|
||
|
}
|
||
|
|
||
|
|
||
|
pfnConvolution ConvGetFnPtrRev( void * setup )
|
||
|
{
|
||
|
struct ConvSetup * s = (struct ConvSetup*)setup;
|
||
|
if (!s)
|
||
|
return NULL;
|
||
|
return s->pfn;
|
||
|
}
|
||
|
|
||
|
|
||
|
void convSimdDestroy( void * setup )
|
||
|
{
|
||
|
convDestroyRev(setup);
|
||
|
}
|
||
|
|
||
|
|
||
|
void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
|
||
|
{
|
||
|
void * p = pffastconv_new_setup( H, N, BlkLen, flags );
|
||
|
if (!p)
|
||
|
printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
|
||
|
return p;
|
||
|
}
|
||
|
|
||
|
|
||
|
void fastConvDestroy( void * setup )
|
||
|
{
|
||
|
pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||
|
{
|
||
|
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||
|
const float * RESTRICT X = input;
|
||
|
const float * RESTRICT Hrev = p->H;
|
||
|
float * RESTRICT Y = output;
|
||
|
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||
|
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||
|
int i, j;
|
||
|
(void)Yref;
|
||
|
(void)applyFlush;
|
||
|
|
||
|
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||
|
{
|
||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||
|
{
|
||
|
float sumRe = 0.0F, sumIm = 0.0F;
|
||
|
for ( j = 0; j < Nr; j += 2 )
|
||
|
{
|
||
|
sumRe += X[i+j ] * Hrev[j];
|
||
|
sumIm += X[i+j+1] * Hrev[j+1];
|
||
|
}
|
||
|
Y[i ] = sumRe;
|
||
|
Y[i+1] = sumIm;
|
||
|
}
|
||
|
return i/2;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
for ( i = 0; i <= lenNr; ++i )
|
||
|
{
|
||
|
float sum = 0.0F;
|
||
|
for (j = 0; j < Nr; ++j )
|
||
|
sum += X[i+j] * Hrev[j];
|
||
|
Y[i] = sum;
|
||
|
}
|
||
|
return i;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||
|
{
|
||
|
float sum[4];
|
||
|
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||
|
const float * RESTRICT X = input;
|
||
|
const float * RESTRICT Hrev = p->H;
|
||
|
float * RESTRICT Y = output;
|
||
|
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||
|
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||
|
int i, j;
|
||
|
(void)Yref;
|
||
|
(void)applyFlush;
|
||
|
|
||
|
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||
|
{
|
||
|
if ( (Nr & 3) == 0 )
|
||
|
{
|
||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||
|
{
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j < Nr; j += 4 )
|
||
|
{
|
||
|
sum[0] += X[i+j] * Hrev[j];
|
||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||
|
}
|
||
|
Y[i ] = sum[0] + sum[2];
|
||
|
Y[i+1] = sum[1] + sum[3];
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
const int M = Nr & (~3);
|
||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||
|
{
|
||
|
float tailSumRe = 0.0F, tailSumIm = 0.0F;
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j < M; j += 4 )
|
||
|
{
|
||
|
sum[0] += X[i+j ] * Hrev[j ];
|
||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||
|
}
|
||
|
for ( ; j < Nr; j += 2 ) {
|
||
|
tailSumRe += X[i+j ] * Hrev[j ];
|
||
|
tailSumIm += X[i+j+1] * Hrev[j+1];
|
||
|
}
|
||
|
Y[i ] = ( sum[0] + sum[2] ) + tailSumRe;
|
||
|
Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
|
||
|
}
|
||
|
}
|
||
|
return i/2;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if ( (Nr & 3) == 0 )
|
||
|
{
|
||
|
for ( i = 0; i <= lenNr; ++i )
|
||
|
{
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j < Nr; j += 4 )
|
||
|
{
|
||
|
sum[0] += X[i+j] * Hrev[j];
|
||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||
|
}
|
||
|
Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
|
||
|
}
|
||
|
return i;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
const int M = Nr & (~3);
|
||
|
/* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
|
||
|
for ( i = 0; i <= lenNr; ++i )
|
||
|
{
|
||
|
float tailSum = 0.0;
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j < M; j += 4 )
|
||
|
{
|
||
|
sum[0] += X[i+j] * Hrev[j];
|
||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||
|
}
|
||
|
for ( ; j < Nr; ++j )
|
||
|
tailSum += X[i+j] * Hrev[j];
|
||
|
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
|
||
|
}
|
||
|
return i;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
|
||
|
{
|
||
|
float sum[4];
|
||
|
struct ConvSetup * p = (struct ConvSetup*)setup;
|
||
|
(void)Yref;
|
||
|
(void)applyFlush;
|
||
|
if (p->flags & PFFASTCONV_SYMMETRIC)
|
||
|
{
|
||
|
const float * RESTRICT X = input;
|
||
|
const float * RESTRICT Hrev = p->H;
|
||
|
float * RESTRICT Y = output;
|
||
|
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||
|
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||
|
const int h = Nr / 2 -4;
|
||
|
const int E = Nr -4;
|
||
|
int i, j;
|
||
|
|
||
|
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||
|
{
|
||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||
|
{
|
||
|
const int k = i + E;
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j <= h; j += 4 )
|
||
|
{
|
||
|
sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+2] );
|
||
|
sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
|
||
|
sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j ] );
|
||
|
sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
|
||
|
}
|
||
|
Y[i ] = sum[0] + sum[2];
|
||
|
Y[i+1] = sum[1] + sum[3];
|
||
|
}
|
||
|
return i/2;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
for ( i = 0; i <= lenNr; ++i )
|
||
|
{
|
||
|
const int k = i + E;
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j <= h; j += 4 )
|
||
|
{
|
||
|
sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+3] );
|
||
|
sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
|
||
|
sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
|
||
|
sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j ] );
|
||
|
}
|
||
|
Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
|
||
|
}
|
||
|
return i;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
const float * RESTRICT X = input;
|
||
|
const float * RESTRICT Hrev = p->H;
|
||
|
float * RESTRICT Y = output;
|
||
|
const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
|
||
|
const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
|
||
|
int i, j;
|
||
|
|
||
|
if (p->flags & PFFASTCONV_CPLX_INP_OUT)
|
||
|
{
|
||
|
for ( i = 0; i <= lenNr; i += 2 )
|
||
|
{
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j < Nr; j += 4 )
|
||
|
{
|
||
|
sum[0] += X[i+j] * Hrev[j];
|
||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||
|
}
|
||
|
Y[i ] = sum[0] + sum[2];
|
||
|
Y[i+1] = sum[1] + sum[3];
|
||
|
}
|
||
|
return i/2;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if ( (Nr & 3) == 0 )
|
||
|
{
|
||
|
for ( i = 0; i <= lenNr; ++i )
|
||
|
{
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j < Nr; j += 4 )
|
||
|
{
|
||
|
sum[0] += X[i+j] * Hrev[j];
|
||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||
|
}
|
||
|
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
|
||
|
}
|
||
|
return i;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
const int M = Nr & (~3);
|
||
|
/* printf("B: Nr = %d\n", Nr ); */
|
||
|
for ( i = 0; i <= lenNr; ++i )
|
||
|
{
|
||
|
float tailSum = 0.0;
|
||
|
sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
|
||
|
for (j = 0; j < M; j += 4 )
|
||
|
{
|
||
|
sum[0] += X[i+j] * Hrev[j];
|
||
|
sum[1] += X[i+j+1] * Hrev[j+1];
|
||
|
sum[2] += X[i+j+2] * Hrev[j+2];
|
||
|
sum[3] += X[i+j+3] * Hrev[j+3];
|
||
|
}
|
||
|
for ( ; j < Nr; ++j )
|
||
|
tailSum += X[i+j] * Hrev[j];
|
||
|
Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
|
||
|
}
|
||
|
return i;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
|
||
|
{
|
||
|
(void)Yref;
|
||
|
return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
void printFirst( const float * V, const char * st, const int N, const int perLine )
|
||
|
{
|
||
|
(void)V; (void)st; (void)N; (void)perLine;
|
||
|
return;
|
||
|
#if 0
|
||
|
int i;
|
||
|
for ( i = 0; i < N; ++i )
|
||
|
{
|
||
|
if ( (i % perLine) == 0 )
|
||
|
printf("\n%s[%d]", st, i);
|
||
|
printf("\t%.1f", V[i]);
|
||
|
}
|
||
|
printf("\n");
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
#define NUMY 11
|
||
|
|
||
|
|
||
|
int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed) {
|
||
|
double t0, t1, tstop, td, tdref;
|
||
|
float *X, *H;
|
||
|
float *Y[NUMY];
|
||
|
int64_t outN[NUMY];
|
||
|
/* 256 KFloats or 16 MFloats data */
|
||
|
#if 1
|
||
|
const int len = testOutLen ? (1 << 18) : (1 << 24);
|
||
|
#elif 0
|
||
|
const int len = testOutLen ? (1 << 18) : (1 << 13);
|
||
|
#else
|
||
|
const int len = testOutLen ? (1 << 18) : (1024);
|
||
|
#endif
|
||
|
const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
|
||
|
const int lenC = len / cplxFactor;
|
||
|
|
||
|
int yi, yc, posMaxErr;
|
||
|
float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
|
||
|
int i, j, numErrOverLimit, iter;
|
||
|
int retErr = 0;
|
||
|
|
||
|
/* 0 1 2 3 4 5 6 7 8 9 */
|
||
|
pfnConvSetup aSetup[NUMY] = { convSetupRev, convSetupRev, convSetupRev, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup };
|
||
|
pfnConvDestroy aDestroy[NUMY] = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy };
|
||
|
pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, };
|
||
|
pfnConvolution aConv[NUMY] = { slow_conv_R, slow_conv_A, slow_conv_B, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv };
|
||
|
const char * convText[NUMY] = { "R(non-simd)", "A(non-simd)", "B(non-simd)", "fast_conv_64", "fast_conv_128", "fast_conv_256", "fast_conv_512", "fast_conv_1K", "fast_conv_2K", "fast_conv_4K" };
|
||
|
int aFastAlgo[NUMY] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 };
|
||
|
void * aSetupCfg[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
|
||
|
int aBlkLen[NUMY] = { 1024, 1024, 1024, 64, 128, 256, 512, 1024, 2048, 4096 };
|
||
|
#if 1
|
||
|
int aRunAlgo[NUMY] = { 1, 1, 1, FILTERLEN<64, FILTERLEN<128, FILTERLEN<256, FILTERLEN<512, FILTERLEN<1024, FILTERLEN<2048, FILTERLEN<4096 };
|
||
|
#elif 0
|
||
|
int aRunAlgo[NUMY] = { 1, 0, 0, 0 && FILTERLEN<64, 1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096 };
|
||
|
#else
|
||
|
int aRunAlgo[NUMY] = { 1, 1, 1, 0 && FILTERLEN<64, 0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096 };
|
||
|
#endif
|
||
|
double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
|
||
|
|
||
|
X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
|
||
|
for ( i=0; i < NUMY; ++i)
|
||
|
{
|
||
|
if ( 1 || i < 2 )
|
||
|
Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
|
||
|
else
|
||
|
Y[i] = Y[1];
|
||
|
|
||
|
Y[i][0] = 123.F; /* test for pffft_zconvolve_no_accu() */
|
||
|
aSpeedFactor[i] = -1.0;
|
||
|
aDuration[i] = -1.0;
|
||
|
procSmpPerSec[i] = -1.0;
|
||
|
}
|
||
|
|
||
|
H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
|
||
|
|
||
|
/* initialize input */
|
||
|
if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
|
||
|
{
|
||
|
for ( i = 0; i < lenC; ++i )
|
||
|
{
|
||
|
X[2*i ] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
|
||
|
X[2*i+1] = (float)((i+2048) % 4093);
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
for ( i = 0; i < len; ++i )
|
||
|
X[i] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
|
||
|
}
|
||
|
X[ len ] = INVALID_FLOAT_VAL;
|
||
|
X[ len +1 ] = INVALID_FLOAT_VAL;
|
||
|
X[ len +2 ] = INVALID_FLOAT_VAL;
|
||
|
X[ len +3 ] = INVALID_FLOAT_VAL;
|
||
|
|
||
|
if (!testOutLen)
|
||
|
printFirst( X, "X", 64, 8 );
|
||
|
|
||
|
/* filter coeffs */
|
||
|
memset( H, 0, FILTERLEN * sizeof(float) );
|
||
|
#if 1
|
||
|
if ( convFlags & PFFASTCONV_SYMMETRIC )
|
||
|
{
|
||
|
const int half = FILTERLEN / 2;
|
||
|
for ( j = 0; j < half; ++j ) {
|
||
|
switch (j % 3) {
|
||
|
case 0: H[j] = H[FILTERLEN-1-j] = -1.0F; break;
|
||
|
case 1: H[j] = H[FILTERLEN-1-j] = 1.0F; break;
|
||
|
case 2: H[j] = H[FILTERLEN-1-j] = 0.5F; break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
for ( j = 0; j < FILTERLEN; ++j ) {
|
||
|
switch (j % 3) {
|
||
|
case 0: H[j] = -1.0F; break;
|
||
|
case 1: H[j] = 1.0F; break;
|
||
|
case 2: H[j] = 0.5F; break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
#else
|
||
|
H[0] = 1.0F;
|
||
|
H[FILTERLEN -1] = 1.0F;
|
||
|
#endif
|
||
|
if (!testOutLen)
|
||
|
printFirst( H, "H", FILTERLEN, 8 );
|
||
|
|
||
|
printf("\n");
|
||
|
printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
|
||
|
((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
|
||
|
(convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
|
||
|
((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
|
||
|
|
||
|
while (1)
|
||
|
{
|
||
|
|
||
|
for ( yi = 0; yi < NUMY; ++yi )
|
||
|
{
|
||
|
if (!aRunAlgo[yi])
|
||
|
continue;
|
||
|
|
||
|
aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
|
||
|
|
||
|
/* get effective apply function ptr */
|
||
|
if ( aSetupCfg[yi] && aGetFnPtr[yi] )
|
||
|
aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
|
||
|
|
||
|
if ( aSetupCfg[yi] && aConv[yi] ) {
|
||
|
if (testOutLen)
|
||
|
{
|
||
|
t0 = uclock_sec();
|
||
|
outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
|
||
|
t1 = uclock_sec();
|
||
|
td = t1 - t0;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
const int blkLen = 4096; /* required for 'fast_conv_4K' */
|
||
|
int64_t offC = 0, offS, Nout;
|
||
|
int k;
|
||
|
iter = 0;
|
||
|
outN[yi] = 0;
|
||
|
t0 = uclock_sec();
|
||
|
tstop = t0 + 0.25; /* benchmark duration: 250 ms */
|
||
|
do {
|
||
|
for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
|
||
|
{
|
||
|
offS = cplxFactor * offC;
|
||
|
Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], (offC +blkLen >= lenC) /* applyFlush */ );
|
||
|
offC += Nout;
|
||
|
++iter;
|
||
|
if ( !Nout )
|
||
|
break;
|
||
|
if ( offC +blkLen >= lenC )
|
||
|
{
|
||
|
outN[yi] += offC;
|
||
|
offC = 0;
|
||
|
}
|
||
|
}
|
||
|
t1 = uclock_sec();
|
||
|
} while ( t1 < tstop );
|
||
|
outN[yi] += offC;
|
||
|
td = t1 - t0;
|
||
|
procSmpPerSec[yi] = cplxFactor * (double)outN[yi] / td;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
t0 = t1 = td = 0.0;
|
||
|
outN[yi] = 0;
|
||
|
}
|
||
|
aDuration[yi] = td;
|
||
|
if ( yi == 0 ) {
|
||
|
const float * Yvals = Y[0];
|
||
|
const int64_t refOutLen = cplxFactor * outN[0];
|
||
|
tdref = td;
|
||
|
if (printDbg) {
|
||
|
printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
|
||
|
printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
|
||
|
}
|
||
|
aSpeedFactor[yi] = 1.0;
|
||
|
/* */
|
||
|
yRangeMin = FLT_MAX;
|
||
|
yRangeMax = FLT_MIN;
|
||
|
for ( i = 0; i < refOutLen; ++i )
|
||
|
{
|
||
|
if ( yRangeMax < Yvals[i] ) yRangeMax = Yvals[i];
|
||
|
if ( yRangeMin > Yvals[i] ) yRangeMin = Yvals[i];
|
||
|
}
|
||
|
yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
|
||
|
/* yErrLimit = 0.01F; */
|
||
|
if (testOutLen) {
|
||
|
if (1) {
|
||
|
printf("reference output len = %" PRId64 " smp\n", outN[0]);
|
||
|
printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
|
||
|
}
|
||
|
printFirst( Yvals, "Yref", 64, 8 );
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
aSpeedFactor[yi] = tdref / td;
|
||
|
if (printDbg) {
|
||
|
printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
|
||
|
printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int iMaxSpeedSlowAlgo = -1;
|
||
|
int iFirstFastAlgo = -1;
|
||
|
int iMaxSpeedFastAlgo = -1;
|
||
|
int iPrintedRefOutLen = 0;
|
||
|
{
|
||
|
for ( yc = 1; yc < NUMY; ++yc )
|
||
|
{
|
||
|
if (!aRunAlgo[yc])
|
||
|
continue;
|
||
|
if (aFastAlgo[yc]) {
|
||
|
if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
|
||
|
iMaxSpeedFastAlgo = yc;
|
||
|
|
||
|
if (iFirstFastAlgo < 0)
|
||
|
iFirstFastAlgo = yc;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
|
||
|
iMaxSpeedSlowAlgo = yc;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (printSpeed)
|
||
|
{
|
||
|
if (testOutLen)
|
||
|
{
|
||
|
if (iMaxSpeedSlowAlgo >= 0 )
|
||
|
printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
|
||
|
if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
|
||
|
printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
|
||
|
if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
|
||
|
printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
|
||
|
|
||
|
if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
|
||
|
printf("first fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo], aSpeedFactor[iFirstFastAlgo], 1000.0 * aDuration[iFirstFastAlgo]);
|
||
|
if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
|
||
|
printf("2nd fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1], aSpeedFactor[iFirstFastAlgo+1], 1000.0 * aDuration[iFirstFastAlgo+1]);
|
||
|
|
||
|
if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
|
||
|
{
|
||
|
printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
|
||
|
if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
|
||
|
printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
|
||
|
}
|
||
|
printf("\n");
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
for ( yc = 0; yc < NUMY; ++yc )
|
||
|
{
|
||
|
if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
|
||
|
continue;
|
||
|
printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\n",
|
||
|
convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001 );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
for ( yc = 1; yc < NUMY; ++yc )
|
||
|
{
|
||
|
const float * Yref;
|
||
|
const float * Ycurr;
|
||
|
int outMin;
|
||
|
|
||
|
if (!aRunAlgo[yc])
|
||
|
continue;
|
||
|
|
||
|
if (printDbg)
|
||
|
printf("\n");
|
||
|
|
||
|
if ( outN[yc] == 0 )
|
||
|
{
|
||
|
printf("output size 0: '%s' not implemented\n", convText[yc]);
|
||
|
}
|
||
|
else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
|
||
|
{
|
||
|
if (!iPrintedRefOutLen)
|
||
|
{
|
||
|
printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
|
||
|
iPrintedRefOutLen = 1;
|
||
|
}
|
||
|
printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
|
||
|
FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
|
||
|
retErr = 1;
|
||
|
}
|
||
|
|
||
|
posMaxErr = 0;
|
||
|
maxErr = -1.0;
|
||
|
Yref = Y[0];
|
||
|
Ycurr = Y[yc];
|
||
|
outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
|
||
|
numErrOverLimit = 0;
|
||
|
for ( i = 0; i < outMin; ++i )
|
||
|
{
|
||
|
if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit )
|
||
|
{
|
||
|
printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
|
||
|
convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
|
||
|
++numErrOverLimit;
|
||
|
}
|
||
|
|
||
|
if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
|
||
|
{
|
||
|
maxErr = fabsf(Ycurr[i] - Yref[i]);
|
||
|
posMaxErr = i;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
|
||
|
printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
|
||
|
}
|
||
|
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
pffastconv_free(X);
|
||
|
for ( i=0; i < NUMY; ++i)
|
||
|
{
|
||
|
if ( 1 || i < 2 )
|
||
|
pffastconv_free( Y[i] );
|
||
|
if (!aRunAlgo[i])
|
||
|
continue;
|
||
|
aDestroy[i]( aSetupCfg[i] );
|
||
|
}
|
||
|
|
||
|
pffastconv_free(H);
|
||
|
|
||
|
return retErr;
|
||
|
}
|
||
|
|
||
|
/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
|
||
|
void validate_pffft_simd();
|
||
|
int validate_pffft_simd_ex(FILE * DbgOut);
|
||
|
|
||
|
|
||
|
int main(int argc, char **argv)
|
||
|
{
|
||
|
int result = 0;
|
||
|
int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
|
||
|
int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
|
||
|
int testReal = 1, testCplx = 1, testSymetric = 0;
|
||
|
|
||
|
for ( i = 1; i < argc; ++i ) {
|
||
|
|
||
|
if (!strcmp(argv[i], "--test-simd")) {
|
||
|
int numErrs = validate_pffft_simd_ex(stdout);
|
||
|
fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
|
||
|
return ( numErrs > 0 ? 1 : 0 );
|
||
|
}
|
||
|
|
||
|
if (!strcmp(argv[i], "--no-len")) {
|
||
|
testOutLens = 0;
|
||
|
}
|
||
|
else if (!strcmp(argv[i], "--no-bench")) {
|
||
|
benchConv = 0;
|
||
|
}
|
||
|
else if (!strcmp(argv[i], "--quick")) {
|
||
|
quickTest = 1;
|
||
|
}
|
||
|
else if (!strcmp(argv[i], "--slow")) {
|
||
|
slowTest = 1;
|
||
|
}
|
||
|
else if (!strcmp(argv[i], "--real")) {
|
||
|
testCplx = 0;
|
||
|
}
|
||
|
else if (!strcmp(argv[i], "--cplx")) {
|
||
|
testReal = 0;
|
||
|
}
|
||
|
else if (!strcmp(argv[i], "--sym")) {
|
||
|
testSymetric = 1;
|
||
|
}
|
||
|
else /* if (!strcmp(argv[i], "--help")) */ {
|
||
|
printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
|
||
|
exit(1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
if (testOutLens)
|
||
|
{
|
||
|
for ( k = 0; k < 3; ++k )
|
||
|
{
|
||
|
if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
|
||
|
continue;
|
||
|
printf("\n\n==========\n");
|
||
|
printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
|
||
|
printf("==========\n");
|
||
|
flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
|
||
|
flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
|
||
|
flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT;
|
||
|
testOutLen = 1;
|
||
|
printDbg = 0;
|
||
|
printSpeed = 0;
|
||
|
for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
|
||
|
{
|
||
|
if ( (M % 16) != 0 && testSymetric )
|
||
|
continue;
|
||
|
result |= test(M, flagsB, testOutLen, printDbg, printSpeed);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (benchConv)
|
||
|
{
|
||
|
for ( k = 0; k < 3; ++k )
|
||
|
{
|
||
|
if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
|
||
|
continue;
|
||
|
printf("\n\n==========\n");
|
||
|
printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
|
||
|
printf("==========\n");
|
||
|
flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
|
||
|
flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
|
||
|
flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
|
||
|
testOutLen = 0;
|
||
|
printDbg = 0;
|
||
|
printSpeed = 1;
|
||
|
if (!slowTest) {
|
||
|
result |= test( 32, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
result |= test( 32+ 16, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
result |= test( 64, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
result |= test( 64+ 32, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
result |= test(128, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
}
|
||
|
if (!quickTest) {
|
||
|
result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
result |= test(256, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
result |= test(512, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
result |= test(1024, flagsC, testOutLen, printDbg, printSpeed);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return result;
|
||
|
}
|
||
|
|