/* openmpi/ompi/mca/op/avx/op_avx_component.c */
/*
* Copyright (c) 2019-2020 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2020 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2021 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file
*
* This is the "avx" component source code.
*
*/
#include "ompi_config.h"
#include "opal/util/printf.h"
#include "ompi/include/mpi_portable_platform.h"
#include "ompi/constants.h"
#include "ompi/op/op.h"
#include "ompi/mca/op/op.h"
#include "ompi/mca/op/base/base.h"
#include "ompi/mca/op/avx/op_avx.h"
/* Forward declarations of the component entry points registered below. */
static int avx_component_open(void);
static int avx_component_close(void);
static int avx_component_init_query(bool enable_progress_threads,
                                    bool enable_mpi_thread_multiple);
static struct ompi_op_base_module_1_0_0_t *
    avx_component_op_query(struct ompi_op_t *op, int *priority);
static int avx_component_register(void);

/* Human-readable names for the SIMD capability bits exposed through the
 * "capabilities" / "support" MCA flag variables.  The flag values must stay
 * in sync with the OMPI_OP_AVX_HAS_*_FLAG constants used throughout this
 * file (note the gap between 0x020 and 0x100 — presumably reserved bits;
 * verify against op_avx.h before adding entries). */
static mca_base_var_enum_value_flag_t avx_support_flags[] = {
    { .flag = 0x001, .string = "SSE" },
    { .flag = 0x002, .string = "SSE2" },
    { .flag = 0x004, .string = "SSE3" },
    { .flag = 0x008, .string = "SSE4.1" },
    { .flag = 0x010, .string = "AVX" },
    { .flag = 0x020, .string = "AVX2" },
    { .flag = 0x100, .string = "AVX512F" },
    { .flag = 0x200, .string = "AVX512BW" },
    { .flag = 0, .string = NULL },  /* sentinel terminating the table */
};
/**
* A slightly modified code from
* https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
*/
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)
#include <immintrin.h>
/**
 * Detect the SIMD capabilities of the running CPU using the Intel
 * compiler's _may_i_use_cpu_feature() intrinsic, which also accounts
 * for OS support of the corresponding register state.
 *
 * @return bitwise OR of the OMPI_OP_AVX_HAS_*_FLAG bits available here.
 */
static uint32_t has_intel_AVX_features(void)
{
    uint32_t detected = 0;

    if (_may_i_use_cpu_feature(_FEATURE_AVX512F)) {
        detected |= OMPI_OP_AVX_HAS_AVX512F_FLAG;
    }
    if (_may_i_use_cpu_feature(_FEATURE_AVX512BW)) {
        detected |= OMPI_OP_AVX_HAS_AVX512BW_FLAG;
    }
    if (_may_i_use_cpu_feature(_FEATURE_AVX2)) {
        detected |= OMPI_OP_AVX_HAS_AVX2_FLAG;
    }
    if (_may_i_use_cpu_feature(_FEATURE_AVX)) {
        detected |= OMPI_OP_AVX_HAS_AVX_FLAG;
    }
    if (_may_i_use_cpu_feature(_FEATURE_SSE4_1)) {
        detected |= OMPI_OP_AVX_HAS_SSE4_1_FLAG;
    }
    if (_may_i_use_cpu_feature(_FEATURE_SSE3)) {
        detected |= OMPI_OP_AVX_HAS_SSE3_FLAG;
    }
    if (_may_i_use_cpu_feature(_FEATURE_SSE2)) {
        detected |= OMPI_OP_AVX_HAS_SSE2_FLAG;
    }
    if (_may_i_use_cpu_feature(_FEATURE_SSE)) {
        detected |= OMPI_OP_AVX_HAS_SSE_FLAG;
    }
    return detected;
}
#else /* non-Intel compiler */
#include <stdint.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
/**
 * Execute the CPUID instruction for leaf @eax / sub-leaf @ecx and store the
 * resulting EAX, EBX, ECX, EDX register values into abcd[0..3].
 *
 * On MSVC the __cpuidex intrinsic is used; elsewhere inline assembly.
 */
static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
{
#if defined(_MSC_VER)
    __cpuidex(abcd, eax, ecx);
#else
    uint32_t ebx = 0, edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ )
    /* in case of PIC under 32-bit EBX cannot be clobbered: stash EBX in EDI
     * around the CPUID instruction and read the result back from EDI */
    __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
#else
    __asm__ ( "cpuid" : "+b" (ebx),
#endif /* defined( __i386__ ) && defined ( __PIC__ ) */
              /* shared constraint tail for both asm variants above:
               * EAX/ECX are inputs+outputs, EDX is output-only */
              "+a" (eax), "+c" (ecx), "=d" (edx) );
    abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif
}
/**
 * Detect the SIMD capabilities of the running CPU by querying CPUID
 * directly (non-Intel-compiler path).
 *
 * @return bitwise OR of the OMPI_OP_AVX_HAS_*_FLAG bits reported by CPUID.
 *
 * NOTE(review): on non-Apple OSes this trusts the CPUID feature bits alone;
 * strictly, AVX/AVX-512 use also requires OSXSAVE + an XGETBV check that the
 * OS preserves the YMM/ZMM state — confirm this is handled elsewhere.
 */
static uint32_t has_intel_AVX_features(void)
{
    /* From https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits */
    const uint32_t avx512f_mask = (1U << 16);   // AVX512F   (EAX = 7, ECX = 0) : EBX
    const uint32_t avx512_bw_mask = (1U << 30); // AVX512BW  (EAX = 7, ECX = 0) : EBX
    const uint32_t avx2_mask = (1U << 5);       // AVX2      (EAX = 7, ECX = 0) : EBX
    const uint32_t avx_mask = (1U << 28);       // AVX       (EAX = 1, ECX = 0) : ECX
    const uint32_t sse4_1_mask = (1U << 19);    // SSE4.1    (EAX = 1, ECX = 0) : ECX
    const uint32_t sse3_mask = (1U << 0);       // SSE3      (EAX = 1, ECX = 0) : ECX
    const uint32_t sse2_mask = (1U << 26);      // SSE2      (EAX = 1, ECX = 0) : EDX
    const uint32_t sse_mask = (1U << 15);       // SSE       (EAX = 1, ECX = 0) : EDX
    uint32_t flags = 0, abcd[4];

    /* Leaf 1: basic feature bits (SSE* and AVX). */
    run_cpuid( 1, 0, abcd );
    flags |= (abcd[2] & avx_mask) ? OMPI_OP_AVX_HAS_AVX_FLAG : 0;
    flags |= (abcd[2] & sse4_1_mask) ? OMPI_OP_AVX_HAS_SSE4_1_FLAG : 0;
    flags |= (abcd[2] & sse3_mask) ? OMPI_OP_AVX_HAS_SSE3_FLAG : 0;
    flags |= (abcd[3] & sse2_mask) ? OMPI_OP_AVX_HAS_SSE2_FLAG : 0;
    flags |= (abcd[3] & sse_mask) ? OMPI_OP_AVX_HAS_SSE_FLAG : 0;
#if defined(__APPLE__)
    uint32_t fma_movbe_osxsave_mask = ((1U << 12) | (1U << 22) | (1U << 27)); /* FMA(12) + MOVBE (22) OSXSAVE (27) */
    // OS supports extended processor state management ?
    /* On macOS, if any of FMA/MOVBE/OSXSAVE is missing, report no SIMD
     * support at all (drops the SSE flags gathered above as well). */
    if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
        return 0;
#endif /* defined(__APPLE__) */
    /* Leaf 7, sub-leaf 0: extended feature bits (AVX2 and AVX-512). */
    run_cpuid( 7, 0, abcd );
    flags |= (abcd[1] & avx512f_mask) ? OMPI_OP_AVX_HAS_AVX512F_FLAG : 0;
    flags |= (abcd[1] & avx512_bw_mask) ? OMPI_OP_AVX_HAS_AVX512BW_FLAG : 0;
    flags |= (abcd[1] & avx2_mask) ? OMPI_OP_AVX_HAS_AVX2_FLAG : 0;
    return flags;
}
#endif /* non-Intel compiler */
/* The single global instance of this component: base component metadata
 * plus the AVX-specific `flags`/`supported` fields filled in at register
 * time.  Note there is no init-query hook in .opc_version; selection is
 * driven by opc_init_query / opc_op_query below. */
ompi_op_avx_component_t mca_op_avx_component = {
    {
        .opc_version = {
            OMPI_OP_BASE_VERSION_1_0_0,

            .mca_component_name = "avx",
            MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                  OMPI_RELEASE_VERSION),

            .mca_open_component = avx_component_open,
            .mca_close_component = avx_component_close,
            .mca_register_component_params = avx_component_register,
        },
        .opc_data = {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        .opc_init_query = avx_component_init_query,
        .opc_op_query = avx_component_op_query,
    },
};
/*
* Component open
*/
/*
 * Component open: nothing to allocate or acquire.
 */
static int avx_component_open(void)
{
    /* Capability detection already happened in avx_component_register();
     * a zero flag set there means the hardware is unsuitable or the user
     * disabled AVX support.  We deliberately do not fail here: returning
     * anything other than OMPI_SUCCESS would hide this component from
     * ompi_info output, which is probably not what anyone wants. */
    return OMPI_SUCCESS;
}
/*
* Component close
*/
/*
 * Component close: release anything the component acquired.
 */
static int avx_component_close(void)
{
    /* Nothing was allocated in avx_component_open(), so there is nothing
     * to tear down.  Note that this runs at the very end of the process
     * lifetime, possibly after all modules have been created and
     * destroyed. */
    return OMPI_SUCCESS;
}
/*
* Register MCA params.
*/
/*
 * Register MCA params.
 *
 * Detects the SIMD levels available on this machine and exposes two MCA
 * variables:
 *   - op_avx_capabilities: read-only set of levels the hardware provides
 *   - op_avx_support:      user-settable cap on the levels actually used
 * The effective flag set is the intersection of the two.
 */
static int
avx_component_register(void)
{
    mca_op_avx_component.supported =
        mca_op_avx_component.flags = has_intel_AVX_features();

    /* MCA var enum flag for conveniently seeing SSE/MMX/AVX support values.
     * Enum creation can fail (e.g. out of memory); in that case fall back
     * to registering the variables as plain integers instead of
     * dereferencing a NULL enumerator (the original code crashed here). */
    mca_base_var_enum_flag_t *new_enum_flag = NULL;
    (void) mca_base_var_enum_create_flag("op_avx_support_flags",
                                         avx_support_flags, &new_enum_flag);

    (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
                                           "capabilities",
                                           "Level of SSE/MMX/AVX support available in the current environment",
                                           MCA_BASE_VAR_TYPE_INT,
                                           (NULL != new_enum_flag) ? &(new_enum_flag->super) : NULL,
                                           0, 0,
                                           OPAL_INFO_LVL_4,
                                           MCA_BASE_VAR_SCOPE_CONSTANT,
                                           &mca_op_avx_component.supported);

    (void) mca_base_component_var_register(&mca_op_avx_component.super.opc_version,
                                           "support",
                                           "Level of SSE/MMX/AVX support to be used, capped by the local architecture capabilities",
                                           MCA_BASE_VAR_TYPE_INT,
                                           (NULL != new_enum_flag) ? &(new_enum_flag->super) : NULL,
                                           0, 0,
                                           OPAL_INFO_LVL_4,
                                           MCA_BASE_VAR_SCOPE_LOCAL,
                                           &mca_op_avx_component.flags);
    if (NULL != new_enum_flag) {
        OBJ_RELEASE(new_enum_flag);
    }

    /* Never allow a level the local hardware does not provide. */
    mca_op_avx_component.flags &= mca_op_avx_component.supported;
    return OMPI_SUCCESS;
}
/*
* Query whether this component wants to be used in this process.
*/
static int
static int
avx_component_init_query(bool enable_progress_threads,
                         bool enable_mpi_thread_multiple)
{
    /* With no usable SIMD level there is nothing this component can offer;
     * threading modes are irrelevant to the decision. */
    return (0 == mca_op_avx_component.flags)
        ? OMPI_ERR_NOT_SUPPORTED : OMPI_SUCCESS;
}
/* Per-ISA dispatch tables, defined in the op_avx_functions.c translation
 * units that are compiled with the matching instruction-set flags.  Each
 * table is indexed by [Fortran op index][datatype index]; NULL entries
 * mean "no specialized implementation for this op/type pair". */
#if OMPI_MCA_OP_HAVE_AVX512
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx512[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
#if OMPI_MCA_OP_HAVE_AVX2
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx2[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
#if OMPI_MCA_OP_HAVE_AVX
extern ompi_op_base_handler_fn_t ompi_op_avx_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
extern ompi_op_base_3buff_handler_fn_t ompi_op_avx_3buff_functions_avx[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
#endif
/*
* Query whether this component can be used for a specific op
*/
/*
 * Query whether this component can be used for a specific op.
 *
 * For the supported intrinsic ops, builds a module whose per-type function
 * tables are filled from the highest available ISA level downward
 * (AVX512 -> AVX2 -> AVX): a slot is only taken from a lower level if the
 * higher level left it NULL.  Returns NULL for unsupported ops.
 */
static struct ompi_op_base_module_1_0_0_t*
avx_component_op_query(struct ompi_op_t *op, int *priority)
{
    ompi_op_base_module_t *module = NULL;
    /* Sanity check -- although the framework should never invoke the
       _component_op_query() on non-intrinsic MPI_Op's, we'll put a
       check here just to be sure. */
    if (0 == (OMPI_OP_FLAGS_INTRINSIC & op->o_flags)) {
        return NULL;
    }
    switch (op->o_f_to_c_index) {
    case OMPI_OP_BASE_FORTRAN_MAX:
    case OMPI_OP_BASE_FORTRAN_MIN:
    case OMPI_OP_BASE_FORTRAN_SUM:
    case OMPI_OP_BASE_FORTRAN_PROD:
    case OMPI_OP_BASE_FORTRAN_BOR:
    case OMPI_OP_BASE_FORTRAN_BAND:
    case OMPI_OP_BASE_FORTRAN_BXOR:
        module = OBJ_NEW(ompi_op_base_module_t);
        for (int i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
#if OMPI_MCA_OP_HAVE_AVX512
            /* Highest priority: AVX-512 implementations (may still be NULL
             * for op/type pairs without a specialized kernel). */
            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX512F_FLAG ) {
                module->opm_fns[i] = ompi_op_avx_functions_avx512[op->o_f_to_c_index][i];
                module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx512[op->o_f_to_c_index][i];
            }
#endif
#if OMPI_MCA_OP_HAVE_AVX2
            /* Fall back to AVX2 only for slots AVX-512 did not fill. */
            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX2_FLAG ) {
                if( NULL == module->opm_fns[i] ) {
                    module->opm_fns[i] = ompi_op_avx_functions_avx2[op->o_f_to_c_index][i];
                }
                if( NULL == module->opm_3buff_fns[i] ) {
                    module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx2[op->o_f_to_c_index][i];
                }
            }
#endif
#if OMPI_MCA_OP_HAVE_AVX
            /* Last resort: plain AVX, again only for still-empty slots. */
            if( mca_op_avx_component.flags & OMPI_OP_AVX_HAS_AVX_FLAG ) {
                if( NULL == module->opm_fns[i] ) {
                    module->opm_fns[i] = ompi_op_avx_functions_avx[op->o_f_to_c_index][i];
                }
                if( NULL == module->opm_3buff_fns[i] ) {
                    module->opm_3buff_fns[i] = ompi_op_avx_3buff_functions_avx[op->o_f_to_c_index][i];
                }
            }
#endif
            /* Bump the module's reference count once per registered function
             * pointer -- presumably each installed fn entry counts as a
             * holder of the module; verify against ompi/op refcounting
             * conventions. */
            if( NULL != module->opm_fns[i] ) {
                OBJ_RETAIN(module);
            }
            if( NULL != module->opm_3buff_fns[i] ) {
                OBJ_RETAIN(module);
            }
        }
        break;
    case OMPI_OP_BASE_FORTRAN_LAND:
    case OMPI_OP_BASE_FORTRAN_LOR:
    case OMPI_OP_BASE_FORTRAN_LXOR:
    case OMPI_OP_BASE_FORTRAN_MAXLOC:
    case OMPI_OP_BASE_FORTRAN_MINLOC:
    case OMPI_OP_BASE_FORTRAN_REPLACE:
    default:
        /* No vectorized implementations for these ops. */
        break;
    }
    /* If we got a module from above, we'll return it.  Otherwise,
       we'll return NULL, indicating that this component does not want
       to be considered for selection for this MPI_Op.  Note that the
       functions each returned a *avx* component pointer
       (vs. a *base* component pointer -- where an *avx* component
       is a base component plus some other module-specific cached
       information), so we have to cast it to the right pointer type
       before returning. */
    if (NULL != module) {
        *priority = 50;
    }
    return (ompi_op_base_module_1_0_0_t *) module;
}